import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
pd.set_option('future.no_silent_downcasting', True)
data = pd.read_csv(r'data/data.csv', sep='|', dtype={'client_id':int},skipfooter = 1, engine='python')
data.shape
(238615, 77)
[x for x in data.columns if x.startswith('Saving')]
['SavingAccount_Active_ARG_Salary', 'SavingAccount_Active_ARG', 'SavingAccount_Active_DOLLAR', 'SavingAccount_Balance_FirstDate', 'SavingAccount_Balance_LastDate', 'SavingAccount_Balance_Average', 'SavingAccount_Days_with_use', 'SavingAccount_Days_with_Credits', 'SavingAccount_Days_with_Debits', 'SavingAccount_Salary_Payment_Transactions', 'SavingAccount_Transfer_In_Transactions', 'SavingAccount_ATM_Extraction_Transactions', 'SavingAccount_Service_Payment_Transactions', 'SavingAccount_CreditCard_Payment_Transactions', 'SavingAccount_Transfer_Out_Transactions', 'SavingAccount_DebitCard_Spend_Transactions', 'SavingAccount_Transactions_Transactions', 'SavingAccount_Credits_Transactions', 'SavingAccount_Debits_Transactions', 'SavingAccount_Salary_Payment_Amount', 'SavingAccount_Transfer_In_Amount', 'SavingAccount_ATM_Extraction_Amount', 'SavingAccount_Service_Payment_Amount', 'SavingAccount_CreditCard_Payment_Amount', 'SavingAccount_Transfer_Out_Amount', 'SavingAccount_DebitCard_Spend_Amount', 'SavingAccount_Total_Amount', 'SavingAccount_Credits_Amounts', 'SavingAccount_Debits_Amounts']
data.Month.value_counts()
Month 2019-04-01 26547 2019-03-01 26547 2019-02-01 26512 2019-01-01 26503 2018-08-01 26502 2018-10-01 26501 2018-09-01 26501 2018-12-01 26501 2018-11-01 26501 Name: count, dtype: int64
len(data.client_id.unique())
26560
#clientes en la base 26560
data_9m = pd.DataFrame(data.client_id.value_counts().reset_index())
data_9m.columns = ['client_id','cantidad_meses']
data_9m
| client_id | cantidad_meses | |
|---|---|---|
| 0 | 5856970 | 9 |
| 1 | 5895899 | 9 |
| 2 | 4712252 | 9 |
| 3 | 7304330 | 9 |
| 4 | 6657428 | 9 |
| ... | ... | ... |
| 26555 | 6623284 | 1 |
| 26556 | 4424661 | 1 |
| 26557 | 5643352 | 1 |
| 26558 | 6641590 | 1 |
| 26559 | 264018 | 1 |
26560 rows × 2 columns
data_9m[data_9m.cantidad_meses == 9].shape
(26483, 2)
#me quedo con los de 9 meses
data_9m = data_9m[data_9m.cantidad_meses == 9].copy()
#sin paquetes en el ultimo mes
data.Package_Active.value_counts()
Package_Active No 234177 Yes 4438 Name: count, dtype: int64
#tenemos 9 meses, pero usaremos 2 para la prediccion, 1 para lead window y el resto pata el entrenamiento
data_sin_paquete = data[(data.Package_Active == 'No') & (data.Month == '2019-01-01')][['client_id']]
data_sin_paquete.shape
(26026, 1)
#condicion comercial, sera que el cliente tenga cobranding( que es lo de coto, cencusud, etc)
len(data[(data.CreditCard_CoBranding == 'Yes') & (data.Month == '2019-01-01')].client_id.unique())
2843
#prediction Window
data.Target.value_counts()
Target 0.0 176359 1.0 62256 Name: count, dtype: int64
data_Target = data[(data.Target == 1) & (data.Month.isin(['2019-04-01','2019-03-01']))][['client_id']].drop_duplicates()
data_Target.shape
data_Target['TGT'] = 1
data_cobranding = data[(data.CreditCard_CoBranding == 'Yes') & (data.Month == '2019-01-01')][['client_id']]
data_cobranding.shape
(2843, 1)
data_cruce_cobranding = data_cobranding.merge(data_Target, how='left', on='client_id').fillna(0)
data_cruce_cobranding.TGT.value_counts()
TGT 0.0 2836 1.0 7 Name: count, dtype: int64
# como vemos, los que tienen cobranding y con target 1, son solo 7. Esos no los voy a usar, son muy pocos, ademas
# hay una constraint de negocio de los bancos, que dice que a los que tienen cobranding, no les doy paquetes
#porque no tienen mucha plata
## asi que lo que necesitaria es para cumplir el objetivo de "Vender paquetes" es:
# Cruzar todo y quedarse con los clientes aptos:
# 9 mese de data (no le vendo a los clientes nuevos)
# sin cobranding (porque si tiene cobranding se que no les voy a vender a ellos)
# sin paquete activo (no le voy a vender un producto que ya tiene)
data_cobranding_No = data[(data.CreditCard_CoBranding == 'No') & (data.Month == '2019-01-01')][['client_id']]
data_cobranding_No
| client_id | |
|---|---|
| 2 | 5928737 |
| 10 | 6018047 |
| 11 | 5359038 |
| 16 | 6890812 |
| 20 | 115383 |
| ... | ... |
| 238573 | 6570413 |
| 238574 | 6258895 |
| 238585 | 6397274 |
| 238586 | 6007291 |
| 238612 | 6412619 |
23660 rows × 1 columns
universo = data_9m.merge(data_sin_paquete, how='inner', on='client_id')\
.merge(data_cobranding_No, how='inner', on='client_id')\
.merge(data_Target, how='left', on='client_id').fillna(0)
universo.TGT.value_counts()
TGT 0.0 16368 1.0 6823 Name: count, dtype: int64
#en nuestro universo el 30% de las personas compran paquetes
Exploratory Data Analysis¶
training_window = data[(data.Month.isin(['2018-08-01','2018-09-01','2018-10-01','2018-11-01','2018-12-01','2019-01-01']))]
training_window.shape
(159009, 77)
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_External | CreditCard_Payment_Cash | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | Yes | Yes | NaN | NaN |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 238597 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | No | Yes | NaN | NaN |
| 238603 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN |
| 238604 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | Yes | No | NaN | NaN |
| 238608 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN |
| 238612 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN |
159009 rows × 77 columns
training_window.client_id.value_counts()
client_id
5856970 6
3852147 6
1129478 6
711456 6
2531821 6
..
6264756 2
5124642 1
6623284 1
1419642 1
6858355 1
Name: count, Length: 26509, dtype: int64
#para quitar esos clientes que no tienen 6 registros,
#cruzare mi training_window(estos serian 6 meses) con el universo de trabajo(estos son nueve meses)
training_window = training_window.merge(universo, how='inner', on='client_id')
training_window.client_id.value_counts()
client_id
5856970 6
2428341 6
6454439 6
5678167 6
7308381 6
..
6210931 6
2758381 6
6348905 6
5982253 6
5967858 6
Name: count, Length: 23191, dtype: int64
Identity features del ultimo mes de training window¶
#crear data frame con IF
dfIdentityFeatures = training_window[training_window.Month == '2019-01-01'][['client_id','Target','Investment_Numbers','CreditCard_Total_Limit']]
dfIdentityFeatures
| client_id | Target | Investment_Numbers | CreditCard_Total_Limit | |
|---|---|---|---|---|
| 2 | 5928737 | 0.0 | 0.0 | 0.0 |
| 8 | 6018047 | 1.0 | 0.0 | 80000.0 |
| 9 | 5359038 | 1.0 | 0.0 | 64000.0 |
| 11 | 6890812 | 0.0 | 0.0 | 0.0 |
| 13 | 115383 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... |
| 139131 | 6570413 | 0.0 | 0.0 | 28000.0 |
| 139132 | 6258895 | 0.0 | 0.0 | 0.0 |
| 139138 | 6397274 | 0.0 | 0.0 | 40000.0 |
| 139139 | 6007291 | 0.0 | 0.0 | 96000.0 |
| 139145 | 6412619 | 0.0 | 0.0 | 0.0 |
23191 rows × 4 columns
Transform Features del ultimo mes de training window¶
dfTransformFeatures = training_window[training_window.Month == '2019-01-01'][['client_id']]
columnas = ['CreditCard_Premium','CreditCard_Active','CreditCard_CoBranding','Loan_Active',
'Mortgage_Active', 'SavingAccount_Active_ARG_Salary','SavingAccount_Active_ARG','SavingAccount_Active_DOLLAR'
,'DebitCard_Active','Investment_Active','Package_Active','Insurance_Life'
,'Insurance_Home','Insurance_Accidents','Insurance_Mobile','Insurance_ATM','Insurance_Unemployment']
for columna in columnas:
dfTransformFeatures[columna] = np.where(training_window[training_window.Month == '2019-01-01'][columna] == 'Yes',1,0)
#----------
dfTransformFeatures['Sex'] = np.where(training_window[training_window.Month == '2019-01-01']['Sex'] == 'F', 0, 1)
#--------
TiposSegurosColumnas = [x for x in training_window.columns if x.startswith('Insurance')]
dfInsurance = training_window[training_window.Month == '2019-01-01'][['client_id']]
for columna in TiposSegurosColumnas:
dfInsurance[columna] = np.where(training_window[training_window.Month == '2019-01-01'][columna] == 'Yes',1,0)
dfTransformFeatures['Total_Seguros_del_UltimosMes'] = dfInsurance[TiposSegurosColumnas[0]] + dfInsurance[TiposSegurosColumnas[1]] + dfInsurance[TiposSegurosColumnas[2]] +dfInsurance[TiposSegurosColumnas[3]] +dfInsurance[TiposSegurosColumnas[4]]+dfInsurance[TiposSegurosColumnas[5]]
#---------------
dfTransformFeatures['Client_Age_grp'] = training_window[training_window.Month == '2019-01-01'][['Client_Age_grp']]
di = {
"Entre 40 y 49 años" : 40,
"Entre 30 y 39 años" : 30,
"Entre 50 y 59 años" : 50,
"Entre 60 y 64 años" : 60,
"Entre 65 y 69 años" : 65,
"Entre 18 y 29 años" : 18,
"Mayor a 70 años" : 70,
"Menor a 18 años" : 17,
}
dfTransformFeatures.Client_Age_grp = dfTransformFeatures.Client_Age_grp.map(di)
#-----------------
#dfTransformFeatures['MaxSavingAccount_ATM_Extraction_Last6'] = tr
# total cantidad de seguros del ultimo mes, grupos de edades ponerlos en numeros, maxima extraccion de atm de los ultmos 6 meses
dfTransformFeatures
| client_id | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | SavingAccount_Active_ARG_Salary | SavingAccount_Active_ARG | SavingAccount_Active_DOLLAR | DebitCard_Active | ... | Package_Active | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | Sex | Total_Seguros_del_UltimosMes | Client_Age_grp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 5928737 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 30 |
| 8 | 6018047 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 60 |
| 9 | 5359038 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 40 |
| 11 | 6890812 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 40 |
| 13 | 115383 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 70 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139131 | 6570413 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 |
| 139132 | 6258895 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 30 |
| 139138 | 6397274 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 40 |
| 139139 | 6007291 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 |
| 139145 | 6412619 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 60 |
23191 rows × 21 columns
Missing values¶
training_window.columns[training_window.isnull().any()].tolist()
['SavingAccount_Balance_Average', 'Region', 'CreditCard_Product']
Tratamiento missing: SavingAccount_Balance_Average¶
training_window[training_window.SavingAccount_Balance_Average.fillna(-999) == -999]
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 50026 | 4473325 | 0.0 | 2018-12-01 | 2008-05-27 | 2017-05-18 | Yes | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 0.0 |
| 50027 | 4837071 | 1.0 | 2018-12-01 | 2013-02-21 | 2018-09-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 1.0 |
| 50029 | 4525957 | 1.0 | 2019-01-01 | 2012-10-16 | 2016-08-31 | Yes | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 1.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN | 9 | 0.0 |
4 rows × 79 columns
training_window[training_window.client_id == '6412619']['SavingAccount_Balance_Average']
Series([], Name: SavingAccount_Balance_Average, dtype: float64)
training_window['SavingAccount_Balance_Average'] = np.where(training_window.SavingAccount_Balance_Average.isnull(),
training_window.SavingAccount_Balance_FirstDate - training_window.SavingAccount_Debits_Amounts + training_window.SavingAccount_Credits_Amounts,
training_window.SavingAccount_Balance_Average)
training_window[training_window.client_id == '6412619']['SavingAccount_Balance_Average']
Series([], Name: SavingAccount_Balance_Average, dtype: float64)
Tratamiento missing: Region¶
training_window[training_window.Region.fillna('Empty') == 'Empty']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 1.0 | Yes | Yes | NaN | NaN | 9 | 1.0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 0.0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | No | Yes | NaN | NaN | 9 | 0.0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 1.0 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | Yes | No | NaN | NaN | 9 | 1.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | NaN | NaN | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | NaN | NaN | 9 | 0.0 |
139146 rows × 79 columns
data[data.Month == '2019-04-01']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_External | CreditCard_Payment_Cash | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 5775560 | 0.0 | 2019-04-01 | 2013-08-22 | 2014-08-01 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CENTRO | J55660104XX012 |
| 13 | 5800470 | 0.0 | 2019-04-01 | 2013-08-23 | 2018-03-26 | No | Yes | Yes | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CENTRO | J55660123XX012 |
| 17 | 3540244 | 0.0 | 2019-04-01 | 2018-09-07 | 2018-09-07 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | No | No | REGION CENTRO | NaN |
| 43 | 6912865 | 0.0 | 2019-04-01 | 2017-08-17 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN |
| 47 | 6595044 | 0.0 | 2019-04-01 | 2016-01-14 | 2017-09-26 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | J55660104XX012 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 238564 | 5678167 | 1.0 | 2019-04-01 | 2013-02-26 | 2018-11-28 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | No | No | BUENOS AIRES | J55660104XX012 |
| 238589 | 6948039 | 1.0 | 2019-04-01 | 2017-08-14 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | J55660104XX012 |
| 238590 | 1818546 | 1.0 | 2019-04-01 | 2013-12-10 | 2018-02-07 | No | Yes | No | No | No | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CUYO | J55660104XX012 |
| 238611 | 6377583 | 0.0 | 2019-04-01 | 2015-06-03 | 2019-01-04 | No | Yes | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | J55660104XX012 |
| 238613 | 5542402 | 0.0 | 2019-04-01 | 2012-09-13 | 2012-09-13 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | No | Yes | REGION NORTE GRANDE ARGENTINO | J55660104XX012 |
26547 rows × 77 columns
training_window.Region.value_counts()
Series([], Name: count, dtype: int64)
training_window['Region'].describe()
count 0 unique 0 top NaN freq NaN Name: Region, dtype: object
training_win_buffer = training_window.copy()
data_last_Moth = data[data.Month == '2019-04-01'][['client_id','Region']].copy()
data_last_Moth.rename(columns={'Region':'RegionUpdated'}, inplace=True)
training_win_buffer = training_win_buffer.merge(data_last_Moth, how='left', on='client_id')
training_window['Region'] = training_win_buffer['RegionUpdated']
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 1.0 | Yes | Yes | AMBA Resto | NaN | 9 | 1.0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CUYO | NaN | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | No | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION PATAGONICA | NaN | 9 | 1.0 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | Yes | No | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 1.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION PATAGONICA | NaN | 9 | 0.0 |
139146 rows × 79 columns
training_window['Region'].describe()
count 139110 unique 7 top BUENOS AIRES freq 41904 Name: Region, dtype: object
training_window.Region.value_counts()
Region BUENOS AIRES 41904 REGION CENTRO 27474 REGION NORTE GRANDE ARGENTINO 22044 REGION PATAGONICA 14592 CABA Centro/Norte 12288 AMBA Resto 10968 REGION CUYO 9840 Name: count, dtype: int64
Missing values Region que no se pudieron completar¶
training_window[training_window.Region.fillna('Empty') == 'Empty']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5618 | 2181839 | 0.0 | 2018-08-01 | 2006-10-05 | 2006-10-05 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 12257 | 921872 | 0.0 | 2018-09-01 | 2005-11-24 | 2005-11-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 18700 | 2181839 | 0.0 | 2019-01-01 | 2006-10-05 | 2006-10-05 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 20460 | 436555 | 0.0 | 2018-11-01 | 2005-02-09 | 2005-02-09 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 25548 | 727081 | 0.0 | 2018-08-01 | 2005-07-18 | 2005-07-18 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 37210 | 535416 | 0.0 | 2018-09-01 | 2005-04-13 | 2005-04-13 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 37430 | 833913 | 0.0 | 2018-08-01 | 2005-08-31 | 2014-01-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 44841 | 436555 | 0.0 | 2018-08-01 | 2005-02-09 | 2005-02-09 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 53787 | 2181839 | 0.0 | 2018-09-01 | 2006-10-05 | 2006-10-05 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 55285 | 921872 | 0.0 | 2018-11-01 | 2005-11-24 | 2005-11-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 57045 | 436555 | 0.0 | 2018-10-01 | 2005-02-09 | 2005-02-09 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 62988 | 921872 | 0.0 | 2018-12-01 | 2005-11-24 | 2005-11-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 67184 | 833913 | 0.0 | 2019-01-01 | 2005-08-31 | 2014-01-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 77610 | 727081 | 0.0 | 2018-09-01 | 2005-07-18 | 2005-07-18 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 82529 | 2181839 | 0.0 | 2018-10-01 | 2006-10-05 | 2006-10-05 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 85105 | 436555 | 0.0 | 2018-09-01 | 2005-02-09 | 2005-02-09 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 87881 | 535416 | 0.0 | 2018-10-01 | 2005-04-13 | 2005-04-13 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 89371 | 2181839 | 0.0 | 2018-12-01 | 2006-10-05 | 2006-10-05 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 93734 | 833913 | 0.0 | 2018-11-01 | 2005-08-31 | 2014-01-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 93761 | 833913 | 0.0 | 2018-10-01 | 2005-08-31 | 2014-01-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 94231 | 2181839 | 0.0 | 2018-11-01 | 2006-10-05 | 2006-10-05 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 97122 | 727081 | 0.0 | 2018-11-01 | 2005-07-18 | 2005-07-18 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 99710 | 535416 | 0.0 | 2018-12-01 | 2005-04-13 | 2005-04-13 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 101314 | 535416 | 0.0 | 2019-01-01 | 2005-04-13 | 2005-04-13 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 101414 | 727081 | 0.0 | 2019-01-01 | 2005-07-18 | 2005-07-18 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 103617 | 727081 | 0.0 | 2018-10-01 | 2005-07-18 | 2005-07-18 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 105080 | 535416 | 0.0 | 2018-11-01 | 2005-04-13 | 2005-04-13 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 108222 | 833913 | 0.0 | 2018-09-01 | 2005-08-31 | 2014-01-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 109178 | 833913 | 0.0 | 2018-12-01 | 2005-08-31 | 2014-01-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 118693 | 921872 | 0.0 | 2018-10-01 | 2005-11-24 | 2005-11-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 120440 | 727081 | 0.0 | 2018-12-01 | 2005-07-18 | 2005-07-18 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 122807 | 436555 | 0.0 | 2018-12-01 | 2005-02-09 | 2005-02-09 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 130135 | 921872 | 0.0 | 2018-08-01 | 2005-11-24 | 2005-11-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 132195 | 921872 | 0.0 | 2019-01-01 | 2005-11-24 | 2005-11-24 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 135035 | 436555 | 0.0 | 2019-01-01 | 2005-02-09 | 2005-02-09 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
| 139104 | 535416 | 0.0 | 2018-08-01 | 2005-04-13 | 2005-04-13 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | No | No | NaN | NaN | 9 | 0.0 |
36 rows × 79 columns
data[(data.Month == '2019-04-01') & (data.client_id == '833913')]
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_External | CreditCard_Payment_Cash | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product |
|---|
0 rows × 77 columns
data[data.client_id == '833913']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_External | CreditCard_Payment_Cash | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product |
|---|
0 rows × 77 columns
training_window['Region'] = np.where(training_window['Region'].isna(),
'BUENOS AIRES',
training_window['Region'])
training_window[training_window.Region.fillna('Empty') == 'Empty']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT |
|---|
0 rows × 79 columns
training_window.Region.value_counts()
Region BUENOS AIRES 41940 REGION CENTRO 27474 REGION NORTE GRANDE ARGENTINO 22044 REGION PATAGONICA 14592 CABA Centro/Norte 12288 AMBA Resto 10968 REGION CUYO 9840 Name: count, dtype: int64
Tratamiento missing: CreditCard_Product¶
training_window.CreditCard_Product.value_counts()
Series([], Name: count, dtype: int64)
training_window[training_window.CreditCard_Product.fillna('Empty') == 'Empty']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 1.0 | Yes | Yes | AMBA Resto | NaN | 9 | 1.0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CUYO | NaN | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | No | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION PATAGONICA | NaN | 9 | 1.0 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | Yes | No | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 1.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION PATAGONICA | NaN | 9 | 0.0 |
139146 rows × 79 columns
training_window['CreditCard_Product'].describe()
count 0 unique 0 top NaN freq NaN Name: CreditCard_Product, dtype: object
CreditActive_clients = training_window[(training_window.CreditCard_Active == 'Yes') & (training_window.Month == '2019-01-01')]['client_id']
CreditCardProduct_buffer = data[(data.Month == '2019-04-01') & (data.client_id.isin(CreditActive_clients))][['client_id','CreditCard_Product']].copy()
CreditCardProduct_buffer.rename(columns={'CreditCard_Product':'CreditCard_ProductUpdated'}, inplace=True)
training_win_buffer2 = training_window.copy()
training_win_buffer2 = training_win_buffer2.merge(CreditCardProduct_buffer, how='left', on='client_id')
training_window['CreditCard_Product'] = training_win_buffer2['CreditCard_ProductUpdated']
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 1.0 | Yes | Yes | AMBA Resto | J55660202XX012 | 9 | 1.0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CUYO | J55660202XX012 | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | No | Yes | BUENOS AIRES | J55660104XX012 | 9 | 0.0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION PATAGONICA | J55660202XX012 | 9 | 1.0 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | Yes | No | REGION NORTE GRANDE ARGENTINO | J55660202XX012 | 9 | 1.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION PATAGONICA | NaN | 9 | 0.0 |
139146 rows × 79 columns
training_window['CreditCard_Product'].describe()
count 88590 unique 7 top J55660104XX012 freq 49572 Name: CreditCard_Product, dtype: object
training_window.CreditCard_Product.value_counts()
CreditCard_Product J55660104XX012 49572 J55660202XX012 34554 J55660102XX012 2412 J55660702XX012 1494 J55661002XX012 372 J55660124XX012 180 J55660123XX012 6 Name: count, dtype: int64
Missing values CreditCard_Product que no se pudieron completar¶
training_window[training_window.CreditCard_Product.fillna('Empty') == 'Empty']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | NaN | 9 | 0.0 |
| 5 | 6412264 | 0.0 | 2018-09-01 | 2016-01-27 | 2019-01-03 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| 6 | 6318899 | 0.0 | 2018-10-01 | 2015-03-26 | 2018-01-31 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139122 | 5940877 | 0.0 | 2018-08-01 | 2014-04-08 | 2018-12-19 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | NaN | 9 | 0.0 |
| 139132 | 6258895 | 0.0 | 2019-01-01 | 2015-03-12 | 2018-03-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139135 | 6351091 | 0.0 | 2018-10-01 | 2015-05-06 | 2018-04-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | NaN | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION PATAGONICA | NaN | 9 | 0.0 |
50556 rows × 79 columns
data[data.client_id == '5967858']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_External | CreditCard_Payment_Cash | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product |
|---|
0 rows × 77 columns
training_window['CreditCard_Product'] = np.where(training_window['CreditCard_Product'].isna(),
'No',
training_window['CreditCard_Product'])
training_window[training_window.CreditCard_Product.fillna('Empty') == 'Empty']
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT |
|---|
0 rows × 79 columns
training_window.CreditCard_Product.value_counts()
CreditCard_Product No 50556 J55660104XX012 49572 J55660202XX012 34554 J55660102XX012 2412 J55660702XX012 1494 J55661002XX012 372 J55660124XX012 180 J55660123XX012 6 Name: count, dtype: int64
training_window.columns[training_window.isnull().any()].tolist()
[]
Outliers¶
training_window['SavingAccount_Days_with_use'].describe()
count 139146.000000 mean 2.512685 std 3.831400 min 0.000000 25% 0.000000 50% 0.000000 75% 4.000000 max 41.000000 Name: SavingAccount_Days_with_use, dtype: float64
import seaborn as sns
import matplotlib.pyplot as plt
# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')
# display the plot
plt.show()
plt.boxplot(training_window['SavingAccount_Balance_Average'])
# add labels for five number summary
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].min(), s ='min')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].quantile(0.25), s ='Q1')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].median(), s ='meadian (Q2)')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].quantile(0.75), s ='Q3')
plt.text(x = 1.1, y = training_window['SavingAccount_Balance_Average'].max(), s ='max')
# add the graph title and axes labels
plt.title('Boxplot of Total Bill Amount')
plt.ylabel('Total bill')
# display the plot
plt.show()
training_window['SavingAccount_Balance_Average'].describe()
count 1.391460e+05 mean 4.094871e+03 std 2.293939e+04 min -5.665900e+02 25% 0.000000e+00 50% 1.032500e+01 75% 1.473138e+03 max 1.771201e+06 Name: SavingAccount_Balance_Average, dtype: float64
import seaborn as sns
import matplotlib.pyplot as plt
# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')
# display the plot
plt.show()
p95 = training_window['SavingAccount_Balance_Average'].quantile(0.95)
p99 = training_window['SavingAccount_Balance_Average'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Balance_Average'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 17043.66 p99 70055.67999999813 Three sigma 68818.178586892
training_window['SavingAccount_Balance_Average'] = np.where(training_window['SavingAccount_Balance_Average'] > three_sigma,
three_sigma,
training_window['SavingAccount_Balance_Average'])
training_window['SavingAccount_Balance_Average'].describe()
count 139146.000000 mean 3179.457488 std 9821.897057 min -566.590000 25% 0.000000 50% 10.325000 75% 1473.137500 max 68818.178587 Name: SavingAccount_Balance_Average, dtype: float64
import seaborn as sns
import matplotlib.pyplot as plt
# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')
# display the plot
plt.show()
training_window['SavingAccount_Balance_Average'] = np.where(training_window['SavingAccount_Balance_Average'] > p95,
p95,
training_window['SavingAccount_Balance_Average'])
training_window['SavingAccount_Balance_Average'].describe()
count 139146.000000 mean 2026.112910 std 4351.634834 min -566.590000 25% 0.000000 50% 10.325000 75% 1473.137500 max 17043.660000 Name: SavingAccount_Balance_Average, dtype: float64
import seaborn as sns
import matplotlib.pyplot as plt
# plot a distribution plot
sns.displot(training_window['SavingAccount_Balance_Average'], kind='kde')
# display the plot
plt.show()
Analisis de Outliers de variables 1 al 30¶
variables_analisis_outliers = ['SavingAccount_Days_with_Credits',
'SavingAccount_Days_with_Debits',
'SavingAccount_Salary_Payment_Transactions',
'SavingAccount_Transfer_In_Transactions',
'SavingAccount_ATM_Extraction_Transactions',
'SavingAccount_CreditCard_Payment_Transactions',
'SavingAccount_Transfer_Out_Transactions',
'SavingAccount_DebitCard_Spend_Transactions',
'SavingAccount_Transactions_Transactions',
'SavingAccount_Credits_Transactions',
'SavingAccount_Debits_Transactions',
'SavingAccount_Salary_Payment_Amount',
'SavingAccount_Transfer_In_Amount',
'SavingAccount_ATM_Extraction_Amount',
'SavingAccount_CreditCard_Payment_Amount',
'SavingAccount_Transfer_Out_Amount',
'SavingAccount_DebitCard_Spend_Amount',
'SavingAccount_Total_Amount',
'SavingAccount_Credits_Amounts',
'SavingAccount_Debits_Amounts',
'Operations_HomeBanking',
'Operations_Mobile',
'CreditCard_Balance_ARG',
'CreditCard_Balance_DOLLAR',
'CreditCard_Total_Limit',
'CreditCard_Total_Spending',
'CreditCard_Spending_1_Installment',
'CreditCard_Spending_CrossBoarder',
'CreditCard_Spending_Aut_Debits',
'CreditCard_Revolving']
for variables_analizables in variables_analisis_outliers:
print('************************************************************ ')
print('Variable ', variables_analizables)
print(' ')
training_window[variables_analizables].describe()
# plot a distribution plot
sns.displot(training_window[variables_analizables], kind='kde')
# display the plot
plt.show()
plt.boxplot(training_window[variables_analizables])
# add labels for five number summary
plt.text(x = 1.1, y = training_window[variables_analizables].min(), s ='min')
plt.text(x = 1.1, y = training_window[variables_analizables].quantile(0.25), s ='Q1')
plt.text(x = 1.1, y = training_window[variables_analizables].median(), s ='meadian (Q2)')
plt.text(x = 1.1, y = training_window[variables_analizables].quantile(0.75), s ='Q3')
plt.text(x = 1.1, y = training_window[variables_analizables].max(), s ='max')
# add the graph title and axes labels
plt.title('Boxplot of Total Bill Amount')
plt.ylabel('Total bill')
# display the plot
plt.show()
p95 = training_window[variables_analizables].quantile(0.95)
p99 = training_window[variables_analizables].quantile(0.99)
three_sigma = 3 * training_window[variables_analizables].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
training_window[variables_analizables].describe()
************************************************************ Variable SavingAccount_Days_with_Credits
p95 5.0 p99 8.0 Three sigma 5.3821836766669335 ************************************************************ Variable SavingAccount_Days_with_Debits
p95 9.0 p99 15.0 Three sigma 9.723007390387242 ************************************************************ Variable SavingAccount_Salary_Payment_Transactions
p95 1.0 p99 3.0 Three sigma 1.7306415659542151 ************************************************************ Variable SavingAccount_Transfer_In_Transactions
p95 2.0 p99 4.0 Three sigma 3.4065241487924736 ************************************************************ Variable SavingAccount_ATM_Extraction_Transactions
p95 4.0 p99 12.0 Three sigma 6.793744483785929 ************************************************************ Variable SavingAccount_CreditCard_Payment_Transactions
p95 2.0 p99 4.0 Three sigma 2.820305708750473 ************************************************************ Variable SavingAccount_Transfer_Out_Transactions
p95 0.0 p99 0.0 Three sigma 0.3047514185040877 ************************************************************ Variable SavingAccount_DebitCard_Spend_Transactions
p95 7.0 p99 22.0 Three sigma 12.865522801673926 ************************************************************ Variable SavingAccount_Transactions_Transactions
p95 24.0 p99 47.0 Three sigma 28.791754836314986 ************************************************************ Variable SavingAccount_Credits_Transactions
p95 6.0 p99 11.0 Three sigma 7.817126304056519 ************************************************************ Variable SavingAccount_Debits_Transactions
p95 19.0 p99 38.0 Three sigma 22.894184199079866 ************************************************************ Variable SavingAccount_Salary_Payment_Amount
p95 22890.7775 p99 58568.9505 Three sigma 50991.24006782155 ************************************************************ Variable SavingAccount_Transfer_In_Amount
p95 13000.0 p99 43000.0 Three sigma 46628.46407356864 ************************************************************ Variable SavingAccount_ATM_Extraction_Amount
p95 8600.0 p99 25400.0 Three sigma 15161.853538210948 ************************************************************ Variable SavingAccount_CreditCard_Payment_Amount
p95 11868.3125 p99 33823.049999999756 Three sigma 31971.212947940097 ************************************************************ Variable SavingAccount_Transfer_Out_Amount
p95 0.0 p99 21453.013999999937 Three sigma 47523.41379065669 ************************************************************ Variable SavingAccount_DebitCard_Spend_Amount
p95 6350.3125 p99 20862.84349999993 Three sigma 13168.561377041347 ************************************************************ Variable SavingAccount_Total_Amount
p95 109939.47 p99 527795.7179999923 Three sigma 536040.4438446803 ************************************************************ Variable SavingAccount_Credits_Amounts
p95 54835.455 p99 269663.80649999913 Three sigma 271204.21506443416 ************************************************************ Variable SavingAccount_Debits_Amounts
p95 54108.784999999996 p99 268277.130499996 Three sigma 269760.6696250981 ************************************************************ Variable Operations_HomeBanking
p95 7.0 p99 15.0 Three sigma 8.928026498997632 ************************************************************ Variable Operations_Mobile
p95 5.0 p99 14.0 Three sigma 7.72057758014084 ************************************************************ Variable CreditCard_Balance_ARG
p95 30815.11 p99 55138.89599999998 Three sigma 35962.00973574623 ************************************************************ Variable CreditCard_Balance_DOLLAR
p95 0.99 p99 132.93649999999963 Three sigma 200.20502558797813 ************************************************************ Variable CreditCard_Total_Limit
p95 160000.0 p99 280000.0 Three sigma 184838.11811993475 ************************************************************ Variable CreditCard_Total_Spending
p95 17131.6725 p99 33148.23549999999 Three sigma 21870.01520929554 ************************************************************ Variable CreditCard_Spending_1_Installment
p95 7739.59 p99 19681.75649999997 Three sigma 12897.110532905726 ************************************************************ Variable CreditCard_Spending_CrossBoarder
p95 0.99 p99 143.6204999999987 Three sigma 212.90138519851982 ************************************************************ Variable CreditCard_Spending_Aut_Debits
p95 5619.7275 p99 12510.912499999991 Three sigma 7806.871328247001 ************************************************************ Variable CreditCard_Revolving
p95 17291.8975 p99 36175.25699999951 Three sigma 23840.512630865414
quitando outliers Variable: SavingAccount_Days_with_Credits¶
p95 = training_window['SavingAccount_Days_with_Credits'].quantile(0.95)
p99 = training_window['SavingAccount_Days_with_Credits'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Days_with_Credits'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 5.0 p99 8.0 Three sigma 5.3821836766669335
training_window['SavingAccount_Days_with_Credits'] = np.where(training_window['SavingAccount_Days_with_Credits'] > p99,
p99,
training_window['SavingAccount_Days_with_Credits'])
training_window['SavingAccount_Days_with_Credits'].describe()
count 139146.000000 mean 1.232504 std 1.691454 min 0.000000 25% 0.000000 50% 0.000000 75% 2.000000 max 8.000000 Name: SavingAccount_Days_with_Credits, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Days_with_Credits'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Days_with_Debits¶
p95 = training_window['SavingAccount_Days_with_Debits'].quantile(0.95)
p99 = training_window['SavingAccount_Days_with_Debits'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Days_with_Debits'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 9.0 p99 15.0 Three sigma 9.723007390387242
training_window['SavingAccount_Days_with_Debits'] = np.where(training_window['SavingAccount_Days_with_Debits'] > three_sigma,
three_sigma,
training_window['SavingAccount_Days_with_Debits'])
training_window['SavingAccount_Days_with_Debits'].describe()
count 139146.000000 mean 1.690302 std 2.649375 min 0.000000 25% 0.000000 50% 0.000000 75% 2.000000 max 9.723007 Name: SavingAccount_Days_with_Debits, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Days_with_Debits'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Salary_Payment_Transactions¶
p95 = training_window['SavingAccount_Salary_Payment_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Salary_Payment_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Salary_Payment_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 1.0 p99 3.0 Three sigma 1.7306415659542151
training_window['SavingAccount_Salary_Payment_Transactions'] = np.where(training_window['SavingAccount_Salary_Payment_Transactions'] > p99,
p99,
training_window['SavingAccount_Salary_Payment_Transactions'])
training_window['SavingAccount_Salary_Payment_Transactions'].describe()
count 139146.000000 mean 0.142534 std 0.520272 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 3.000000 Name: SavingAccount_Salary_Payment_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Days_with_Debits'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Transfer_In_Transactions¶
p95 = training_window['SavingAccount_Transfer_In_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_In_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_In_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 2.0 p99 4.0 Three sigma 3.4065241487924736
training_window['SavingAccount_Transfer_In_Transactions'] = np.where(training_window['SavingAccount_Transfer_In_Transactions'] > p99,
p99,
training_window['SavingAccount_Transfer_In_Transactions'])
training_window['SavingAccount_Transfer_In_Transactions'].describe()
count 139146.000000 mean 0.314993 std 0.750827 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 4.000000 Name: SavingAccount_Transfer_In_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transfer_In_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_ATM_Extraction_Transactions¶
p95 = training_window['SavingAccount_ATM_Extraction_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_ATM_Extraction_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_ATM_Extraction_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 4.0 p99 12.0 Three sigma 6.793744483785929
training_window['SavingAccount_ATM_Extraction_Transactions'] = np.where(training_window['SavingAccount_ATM_Extraction_Transactions'] > p95,
p95,
training_window['SavingAccount_ATM_Extraction_Transactions'])
training_window['SavingAccount_ATM_Extraction_Transactions'].describe()
count 139146.000000 mean 0.320505 std 1.006634 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 4.000000 Name: SavingAccount_ATM_Extraction_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_ATM_Extraction_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_CreditCard_Payment_Transactions¶
p95 = training_window['SavingAccount_CreditCard_Payment_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_CreditCard_Payment_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_CreditCard_Payment_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 2.0 p99 4.0 Three sigma 2.820305708750473
training_window['SavingAccount_CreditCard_Payment_Transactions'] = np.where(training_window['SavingAccount_CreditCard_Payment_Transactions'] > p99,
p99,
training_window['SavingAccount_CreditCard_Payment_Transactions'])
training_window['SavingAccount_CreditCard_Payment_Transactions'].describe()
count 139146.000000 mean 0.359170 std 0.767203 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 4.000000 Name: SavingAccount_CreditCard_Payment_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_ATM_Extraction_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Transfer_Out_Transactions¶
p95 = training_window['SavingAccount_Transfer_Out_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_Out_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_Out_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 0.0 p99 0.0 Three sigma 0.3047514185040877
training_window.SavingAccount_Transfer_Out_Transactions.value_counts()
SavingAccount_Transfer_Out_Transactions 0.0 139099 1.0 37 2.0 3 10.0 1 9.0 1 15.0 1 25.0 1 14.0 1 4.0 1 12.0 1 Name: count, dtype: int64
# revisar, preguntar sobre esta variable: SavingAccount_Transfer_Out_Transactions
quitando outliers Variable: SavingAccount_DebitCard_Spend_Transactions¶
p95 = training_window['SavingAccount_DebitCard_Spend_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_DebitCard_Spend_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_DebitCard_Spend_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 7.0 p99 22.0 Three sigma 12.865522801673926
training_window['SavingAccount_DebitCard_Spend_Transactions'] = np.where(training_window['SavingAccount_DebitCard_Spend_Transactions'] > p99,
p99,
training_window['SavingAccount_DebitCard_Spend_Transactions'])
training_window['SavingAccount_DebitCard_Spend_Transactions'].describe()
count 139146.000000 mean 0.991390 std 3.528173 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 22.000000 Name: SavingAccount_DebitCard_Spend_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_DebitCard_Spend_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Transactions_Transactions¶
p95 = training_window['SavingAccount_Transactions_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Transactions_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transactions_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 24.0 p99 47.0 Three sigma 28.791754836314986
training_window['SavingAccount_Transactions_Transactions'] = np.where(training_window['SavingAccount_Transactions_Transactions'] > p99,
p99,
training_window['SavingAccount_Transactions_Transactions'])
training_window['SavingAccount_Transactions_Transactions'].describe()
count 139146.000000 mean 4.765656 std 8.710314 min 0.000000 25% 0.000000 50% 0.000000 75% 5.000000 max 47.000000 Name: SavingAccount_Transactions_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transactions_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Credits_Transactions¶
p95 = training_window['SavingAccount_Credits_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Credits_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Credits_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 6.0 p99 11.0 Three sigma 7.817126304056519
training_window['SavingAccount_Credits_Transactions'] = np.where(training_window['SavingAccount_Credits_Transactions'] > p99,
p99,
training_window['SavingAccount_Credits_Transactions'])
training_window['SavingAccount_Credits_Transactions'].describe()
count 139146.000000 mean 1.509573 std 2.220993 min 0.000000 25% 0.000000 50% 0.000000 75% 2.000000 max 11.000000 Name: SavingAccount_Credits_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Credits_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Debits_Transactions¶
p95 = training_window['SavingAccount_Debits_Transactions'].quantile(0.95)
p99 = training_window['SavingAccount_Debits_Transactions'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Debits_Transactions'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 19.0 p99 38.0 Three sigma 22.894184199079866
training_window['SavingAccount_Debits_Transactions'] = np.where(training_window['SavingAccount_Debits_Transactions'] > p99,
p99,
training_window['SavingAccount_Debits_Transactions'])
training_window['SavingAccount_Debits_Transactions'].describe()
count 139146.000000 mean 3.222780 std 6.824929 min 0.000000 25% 0.000000 50% 0.000000 75% 3.000000 max 38.000000 Name: SavingAccount_Debits_Transactions, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Debits_Transactions'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Salary_Payment_Amount¶
p95 = training_window['SavingAccount_Salary_Payment_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Salary_Payment_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Salary_Payment_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 22890.7775 p99 58568.9505 Three sigma 50991.24006782155
training_window['SavingAccount_Salary_Payment_Amount'] = np.where(training_window['SavingAccount_Salary_Payment_Amount'] > p95,
p95,
training_window['SavingAccount_Salary_Payment_Amount'])
training_window['SavingAccount_Salary_Payment_Amount'].describe()
count 139146.000000 mean 1621.769465 std 5617.856081 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 22890.777500 Name: SavingAccount_Salary_Payment_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Salary_Payment_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Transfer_In_Amount¶
p95 = training_window['SavingAccount_Transfer_In_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_In_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_In_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 13000.0 p99 43000.0 Three sigma 46628.46407356864
training_window['SavingAccount_Transfer_In_Amount'] = np.where(training_window['SavingAccount_Transfer_In_Amount'] > p95,
p95,
training_window['SavingAccount_Transfer_In_Amount'])
training_window['SavingAccount_Transfer_In_Amount'].describe()
count 139146.000000 mean 1420.367686 std 3444.238763 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 13000.000000 Name: SavingAccount_Transfer_In_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transfer_In_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_ATM_Extraction_Amount¶
p95 = training_window['SavingAccount_ATM_Extraction_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_ATM_Extraction_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_ATM_Extraction_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 8600.0 p99 25400.0 Three sigma 15161.853538210948
training_window['SavingAccount_ATM_Extraction_Amount'] = np.where(training_window['SavingAccount_ATM_Extraction_Amount'] > p95,
p95,
training_window['SavingAccount_ATM_Extraction_Amount'])
training_window['SavingAccount_ATM_Extraction_Amount'].describe()
count 139146.000000 mean 642.253338 std 2102.709382 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 8600.000000 Name: SavingAccount_ATM_Extraction_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_ATM_Extraction_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_CreditCard_Payment_Amount¶
p95 = training_window['SavingAccount_CreditCard_Payment_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_CreditCard_Payment_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_CreditCard_Payment_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 11868.3125 p99 33823.049999999756 Three sigma 31971.212947940097
training_window['SavingAccount_CreditCard_Payment_Amount'] = np.where(training_window['SavingAccount_CreditCard_Payment_Amount'] > three_sigma,
three_sigma,
training_window['SavingAccount_CreditCard_Payment_Amount'])
training_window['SavingAccount_CreditCard_Payment_Amount'].describe()
count 139146.000000 mean 1840.757643 std 5226.661125 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 31971.212948 Name: SavingAccount_CreditCard_Payment_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_CreditCard_Payment_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Transfer_Out_Amount¶
p95 = training_window['SavingAccount_Transfer_Out_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Transfer_Out_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Transfer_Out_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 0.0 p99 21453.013999999937 Three sigma 47523.41379065669
training_window['SavingAccount_Transfer_Out_Amount'] = np.where(training_window['SavingAccount_Transfer_Out_Amount'] > p99,
p99,
training_window['SavingAccount_Transfer_Out_Amount'])
training_window['SavingAccount_Transfer_Out_Amount'].describe()
count 139146.000000 mean 451.299070 std 2652.509583 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 21453.014000 Name: SavingAccount_Transfer_Out_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Transfer_Out_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_DebitCard_Spend_Amount¶
p95 = training_window['SavingAccount_DebitCard_Spend_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_DebitCard_Spend_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_DebitCard_Spend_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 6350.3125 p99 20862.84349999993 Three sigma 13168.561377041347
training_window['SavingAccount_DebitCard_Spend_Amount'] = np.where(training_window['SavingAccount_DebitCard_Spend_Amount'] > p95,
p95,
training_window['SavingAccount_DebitCard_Spend_Amount'])
training_window['SavingAccount_DebitCard_Spend_Amount'].describe()
count 139146.000000 mean 504.030718 std 1567.015956 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 6350.312500 Name: SavingAccount_DebitCard_Spend_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_DebitCard_Spend_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Total_Amount¶
p95 = training_window['SavingAccount_Total_Amount'].quantile(0.95)
p99 = training_window['SavingAccount_Total_Amount'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Total_Amount'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 109939.47 p99 527795.7179999923 Three sigma 536040.4438446803
training_window['SavingAccount_Total_Amount'] = np.where(training_window['SavingAccount_Total_Amount'] > p95,
p95,
training_window['SavingAccount_Total_Amount'])
training_window['SavingAccount_Total_Amount'].describe()
count 139146.000000 mean 15375.232616 std 29201.571404 min 0.000000 25% 0.000000 50% 0.000000 75% 14466.357500 max 109939.470000 Name: SavingAccount_Total_Amount, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Total_Amount'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: SavingAccount_Credits_Amounts¶
p95 = training_window['SavingAccount_Credits_Amounts'].quantile(0.95)
p99 = training_window['SavingAccount_Credits_Amounts'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Credits_Amounts'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 54835.455 p99 269663.80649999913 Three sigma 271204.21506443416
training_window['SavingAccount_Credits_Amounts'] = np.where(training_window['SavingAccount_Credits_Amounts'] > p95,
p95,
training_window['SavingAccount_Credits_Amounts'])
training_window['SavingAccount_Credits_Amounts'].describe()
count 139146.000000 mean 7558.049240 std 14577.499232 min 0.000000 25% 0.000000 50% 0.000000 75% 7000.250000 max 54835.455000 Name: SavingAccount_Credits_Amounts, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Credits_Amounts'], kind='kde')
# display the plot
plt.show()
## quitando outliers Variable: SavingAccount_Debits_Amounts
p95 = training_window['SavingAccount_Debits_Amounts'].quantile(0.95)
p99 = training_window['SavingAccount_Debits_Amounts'].quantile(0.99)
three_sigma = 3 * training_window['SavingAccount_Debits_Amounts'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 54108.784999999996 p99 268277.130499996 Three sigma 269760.6696250981
training_window['SavingAccount_Debits_Amounts'] = np.where(training_window['SavingAccount_Debits_Amounts'] > p95,
p95,
training_window['SavingAccount_Debits_Amounts'])
training_window['SavingAccount_Debits_Amounts'].describe()
count 139146.000000 mean 7539.658027 std 14341.236934 min 0.000000 25% 0.000000 50% 0.000000 75% 7102.057500 max 54108.785000 Name: SavingAccount_Debits_Amounts, dtype: float64
# plot a distribution plot
sns.displot(training_window['SavingAccount_Debits_Amounts'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: Operations_HomeBanking¶
p95 = training_window['Operations_HomeBanking'].quantile(0.95)
p99 = training_window['Operations_HomeBanking'].quantile(0.99)
three_sigma = 3 * training_window['Operations_HomeBanking'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 7.0 p99 15.0 Three sigma 8.928026498997632
training_window['Operations_HomeBanking'] = np.where(training_window['Operations_HomeBanking'] > p99,
p99,
training_window['Operations_HomeBanking'])
training_window['Operations_HomeBanking'].describe()
count 139146.000000 mean 1.213186 std 2.761946 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 15.000000 Name: Operations_HomeBanking, dtype: float64
# plot a distribution plot
sns.displot(training_window['Operations_HomeBanking'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: Operations_Mobile¶
p95 = training_window['Operations_Mobile'].quantile(0.95)
p99 = training_window['Operations_Mobile'].quantile(0.99)
three_sigma = 3 * training_window['Operations_Mobile'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 5.0 p99 14.0 Three sigma 7.72057758014084
training_window['Operations_Mobile'] = np.where(training_window['Operations_Mobile'] > p99,
p99,
training_window['Operations_Mobile'])
training_window['Operations_Mobile'].describe()
count 139146.000000 mean 0.624617 std 2.277957 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 14.000000 Name: Operations_Mobile, dtype: float64
# plot a distribution plot
sns.displot(training_window['Operations_Mobile'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Balance_ARG¶
p95 = training_window['CreditCard_Balance_ARG'].quantile(0.95)
p99 = training_window['CreditCard_Balance_ARG'].quantile(0.99)
np99 = -training_window['CreditCard_Balance_ARG'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Balance_ARG'].std()
print('p95 ', p95)
print('p99 ', p99)
print('np99 ', np99)
print('Three sigma ', three_sigma)
p95 30815.11 p99 55138.89599999998 np99 -55138.89599999998 Three sigma 35962.00973574623
training_window['CreditCard_Balance_ARG'] = np.where(training_window['CreditCard_Balance_ARG'] > p99,
p99,
training_window['CreditCard_Balance_ARG'])
training_window['CreditCard_Balance_ARG'] = np.where(training_window['CreditCard_Balance_ARG'] < np99,
np99,
training_window['CreditCard_Balance_ARG'])
training_window['CreditCard_Balance_ARG'].describe()
count 139146.000000 mean 7084.210404 std 10832.826535 min -55138.896000 25% 0.000000 50% 2716.210000 75% 9483.642500 max 55138.896000 Name: CreditCard_Balance_ARG, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Balance_ARG'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Balance_DOLLAR¶
p95 = training_window['CreditCard_Balance_DOLLAR'].quantile(0.95)
p99 = training_window['CreditCard_Balance_DOLLAR'].quantile(0.99)
np99 = -training_window['CreditCard_Balance_DOLLAR'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Balance_DOLLAR'].std()
print('p95 ', p95)
print('p99 ', p99)
print('np99 ', np99)
print('Three sigma ', three_sigma)
p95 0.99 p99 132.93649999999963 np99 -132.93649999999963 Three sigma 200.20502558797813
training_window['CreditCard_Balance_DOLLAR'] = np.where(training_window['CreditCard_Balance_DOLLAR'] > p99,
p99,
training_window['CreditCard_Balance_DOLLAR'])
training_window['CreditCard_Balance_DOLLAR'] = np.where(training_window['CreditCard_Balance_DOLLAR'] < np99,
np99,
training_window['CreditCard_Balance_DOLLAR'])
training_window['CreditCard_Balance_DOLLAR'].describe()
count 139146.000000 mean 2.161760 std 15.995588 min -132.936500 25% 0.000000 50% 0.000000 75% 0.000000 max 132.936500 Name: CreditCard_Balance_DOLLAR, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Balance_DOLLAR'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Total_Limit¶
p95 = training_window['CreditCard_Total_Limit'].quantile(0.95)
p99 = training_window['CreditCard_Total_Limit'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Total_Limit'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 160000.0 p99 280000.0 Three sigma 184838.11811993475
training_window['CreditCard_Total_Limit'] = np.where(training_window['CreditCard_Total_Limit'] > p99,
p99,
training_window['CreditCard_Total_Limit'])
training_window['CreditCard_Total_Limit'].describe()
count 139146.000000 mean 51060.982709 std 56753.331259 min 0.000000 25% 0.000000 50% 40000.000000 75% 64000.000000 max 280000.000000 Name: CreditCard_Total_Limit, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Total_Limit'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Total_Spending¶
p95 = training_window['CreditCard_Total_Spending'].quantile(0.95)
p99 = training_window['CreditCard_Total_Spending'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Total_Spending'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 17131.6725 p99 33148.23549999999 Three sigma 21870.01520929554
training_window['CreditCard_Total_Spending'] = np.where(training_window['CreditCard_Total_Spending'] > p99,
p99,
training_window['CreditCard_Total_Spending'])
training_window['CreditCard_Total_Spending'].describe()
count 139146.000000 mean 4076.655867 std 6249.673704 min -30164.770000 25% 0.000000 50% 1482.710000 75% 5656.147500 max 33148.235500 Name: CreditCard_Total_Spending, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Total_Spending'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Spending_1_Installment¶
p95 = training_window['CreditCard_Spending_1_Installment'].quantile(0.95)
p99 = training_window['CreditCard_Spending_1_Installment'].quantile(0.99)
np99 = -training_window['CreditCard_Spending_1_Installment'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Spending_1_Installment'].std()
print('p95 ', p95)
print('p99 ', p99)
print('np99 ', np99)
print('Three sigma ', three_sigma)
p95 7739.59 p99 19681.75649999997 np99 -19681.75649999997 Three sigma 12897.110532905726
training_window['CreditCard_Spending_1_Installment'] = np.where(training_window['CreditCard_Spending_1_Installment'] > p99,
p99,
training_window['CreditCard_Spending_1_Installment'])
training_window['CreditCard_Spending_1_Installment'] = np.where(training_window['CreditCard_Spending_1_Installment'] < np99,
np99,
training_window['CreditCard_Spending_1_Installment'])
training_window['CreditCard_Spending_1_Installment'].describe()
count 139146.000000 mean 1268.888989 std 3255.660124 min -19681.756500 25% 0.000000 50% 0.000000 75% 656.180000 max 19681.756500 Name: CreditCard_Spending_1_Installment, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Spending_1_Installment'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Spending_CrossBoarder¶
p95 = training_window['CreditCard_Spending_CrossBoarder'].quantile(0.95)
p99 = training_window['CreditCard_Spending_CrossBoarder'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Spending_CrossBoarder'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 0.99 p99 143.6204999999987 Three sigma 212.90138519851982
training_window['CreditCard_Spending_CrossBoarder'] = np.where(training_window['CreditCard_Spending_CrossBoarder'] > p99,
p99,
training_window['CreditCard_Spending_CrossBoarder'])
training_window['CreditCard_Spending_CrossBoarder'].describe()
count 139146.000000 mean 2.589213 std 16.595921 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 143.620500 Name: CreditCard_Spending_CrossBoarder, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Spending_CrossBoarder'], kind='kde')
# display the plot
plt.show()
quitando outliers Variable: CreditCard_Spending_Aut_Debits¶
p95 = training_window['CreditCard_Spending_Aut_Debits'].quantile(0.95)
p99 = training_window['CreditCard_Spending_Aut_Debits'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Spending_Aut_Debits'].std()
print('p95 ', p95)
print('p99 ', p99)
print('Three sigma ', three_sigma)
p95 5619.7275 p99 12510.912499999991 Three sigma 7806.871328247001
training_window['CreditCard_Spending_Aut_Debits'] = np.where(training_window['CreditCard_Spending_Aut_Debits'] > p99,
p99,
training_window['CreditCard_Spending_Aut_Debits'])
training_window['CreditCard_Spending_Aut_Debits'].describe()
count 139146.000000 mean 1033.300702 std 2209.574033 min -9476.360000 25% 0.000000 50% 0.000000 75% 947.745000 max 12510.912500 Name: CreditCard_Spending_Aut_Debits, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Spending_Aut_Debits'], kind='kde')
# display the plot
plt.show()
## quitando outliers Variable: CreditCard_Revolving
p95 = training_window['CreditCard_Revolving'].quantile(0.95)
p99 = training_window['CreditCard_Revolving'].quantile(0.99)
np99 = -training_window['CreditCard_Revolving'].quantile(0.99)
three_sigma = 3 * training_window['CreditCard_Revolving'].std()
print('p95 ', p95)
print('p99 ', p99)
print('np99 ', np99)
print('Three sigma ', three_sigma)
p95 17291.8975 p99 36175.25699999951 np99 -36175.25699999951 Three sigma 23840.512630865414
training_window['CreditCard_Revolving'] = np.where(training_window['CreditCard_Revolving'] > p99,
p99,
training_window['CreditCard_Revolving'])
training_window['CreditCard_Revolving'] = np.where(training_window['CreditCard_Revolving'] < np99,
np99,
training_window['CreditCard_Revolving'])
training_window['CreditCard_Revolving'].describe()
count 139146.000000 mean 2040.762848 std 6810.370574 min -36175.257000 25% 0.000000 50% 0.000000 75% 0.000000 max 36175.257000 Name: CreditCard_Revolving, dtype: float64
# plot a distribution plot
sns.displot(training_window['CreditCard_Revolving'], kind='kde')
# display the plot
plt.show()
Transform Features¶
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 1.0 | Yes | Yes | AMBA Resto | J55660202XX012 | 9 | 1.0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | No | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION NORTE GRANDE ARGENTINO | No | 9 | 0.0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION CUYO | J55660202XX012 | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION CENTRO | No | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | No | Yes | BUENOS AIRES | J55660104XX012 | 9 | 0.0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | No | Yes | No | No | No | ... | 1.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | REGION PATAGONICA | J55660202XX012 | 9 | 1.0 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | No | Yes | No | No | No | ... | 0.0 | 0.0 | 1.0 | 0.0 | Yes | No | REGION NORTE GRANDE ARGENTINO | J55660202XX012 | 9 | 1.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | No | No | No | No | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | Yes | BUENOS AIRES | No | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | No | No | No | Yes | No | ... | 0.0 | 0.0 | 0.0 | 0.0 | Yes | No | REGION PATAGONICA | No | 9 | 0.0 |
139146 rows × 79 columns
variables con valores (Yes/No), (M/F) y las edades¶
columnas = ['CreditCard_Premium','CreditCard_Active','CreditCard_CoBranding','Loan_Active',
'Mortgage_Active', 'SavingAccount_Active_ARG_Salary','SavingAccount_Active_ARG','SavingAccount_Active_DOLLAR'
,'DebitCard_Active','Investment_Active','Package_Active','Insurance_Life'
,'Insurance_Home','Insurance_Accidents','Insurance_Mobile','Insurance_ATM','Insurance_Unemployment','Mobile','Email']
for columna in columnas:
training_window[columna] = np.where(training_window[columna] == 'Yes',1,0)
#----------
training_window['Sex'] = np.where(training_window['Sex'] == 'F', 0, 1)
#---------------
di = {
"Entre 40 y 49 años" : 40,
"Entre 30 y 39 años" : 30,
"Entre 50 y 59 años" : 50,
"Entre 60 y 64 años" : 60,
"Entre 65 y 69 años" : 65,
"Entre 18 y 29 años" : 18,
"Mayor a 70 años" : 70,
"Menor a 18 años" : 17,
}
training_window.Client_Age_grp = training_window.Client_Age_grp.map(di)
training_window['Client_Age_grp'].value_counts()
Client_Age_grp 40 38313 30 34063 50 30588 60 12282 65 9685 18 7509 70 6701 17 5 Name: count, dtype: int64
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Payment_Web | CreditCard_Payment_ATM | CreditCard_Payment_TAS | Investment_Numbers | Mobile | Region | CreditCard_Product | cantidad_meses | TGT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | 0 | 1 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 1 | 1 | AMBA Resto | J55660202XX012 | 9 | 1.0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | REGION CENTRO | No | 9 | 0.0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | REGION NORTE GRANDE ARGENTINO | No | 9 | 0.0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | 0 | 1 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | REGION CUYO | J55660202XX012 | 9 | 0.0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | REGION CENTRO | No | 9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | 0 | 1 | 0 | 0 | 0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 1 | BUENOS AIRES | J55660104XX012 | 9 | 0.0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | 0 | 1 | 0 | 0 | 0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | REGION PATAGONICA | J55660202XX012 | 9 | 1.0 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | 0 | 1 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 1 | 0 | REGION NORTE GRANDE ARGENTINO | J55660202XX012 | 9 | 1.0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 1 | BUENOS AIRES | No | 9 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | REGION PATAGONICA | No | 9 | 0.0 |
139146 rows × 79 columns
One Hot Encoding en variable Region¶
training_window = pd.get_dummies(training_window, columns = ['Region'])
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Product | cantidad_meses | TGT | Region_AMBA Resto | Region_BUENOS AIRES | Region_CABA Centro/Norte | Region_REGION CENTRO | Region_REGION CUYO | Region_REGION NORTE GRANDE ARGENTINO | Region_REGION PATAGONICA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 1.0 | True | False | False | False | False | False | False |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | False | False | False | True | False | False | False |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | False | False | False | False | False | True | False |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 0.0 | False | False | False | False | True | False | False |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | False | False | False | True | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | 0 | 1 | 0 | 0 | 0 | ... | J55660104XX012 | 9 | 0.0 | False | True | False | False | False | False | False |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 1.0 | False | False | False | False | False | False | True |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 1.0 | False | False | False | False | False | True | False |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | False | True | False | False | False | False | False |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | 0 | 0 | 0 | 1 | 0 | ... | No | 9 | 0.0 | False | False | False | False | False | False | True |
139146 rows × 85 columns
columnas = ['Region_AMBA Resto','Region_BUENOS AIRES','Region_CABA Centro/Norte',
'Region_REGION CENTRO', 'Region_REGION CUYO','Region_REGION NORTE GRANDE ARGENTINO','Region_REGION PATAGONICA']
for columna in columnas:
training_window[columna] = np.where(training_window[columna] == True,1,0)
training_window
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | CreditCard_CoBranding | Loan_Active | Mortgage_Active | ... | CreditCard_Product | cantidad_meses | TGT | Region_AMBA Resto | Region_BUENOS AIRES | Region_CABA Centro/Norte | Region_REGION CENTRO | Region_REGION CUYO | Region_REGION NORTE GRANDE ARGENTINO | Region_REGION PATAGONICA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5856970 | 1.0 | 2018-10-01 | 2013-10-23 | 2019-01-10 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 6371753 | 0.0 | 2018-09-01 | 2015-07-29 | 2018-06-02 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 475064 | 0.0 | 2018-12-01 | 2014-07-13 | 2017-11-30 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 3615172 | 0.0 | 2018-09-01 | 2017-12-27 | 2017-12-28 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139141 | 1673642 | 0.0 | 2018-11-01 | 2017-08-18 | 2017-09-26 | 0 | 1 | 0 | 0 | 0 | ... | J55660104XX012 | 9 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 139142 | 6145735 | 1.0 | 2018-11-01 | 2014-10-26 | 2014-10-26 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 1.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 139143 | 5638786 | 1.0 | 2018-11-01 | 2012-12-26 | 2017-03-08 | 0 | 1 | 0 | 0 | 0 | ... | J55660202XX012 | 9 | 1.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 139144 | 3824781 | 0.0 | 2018-08-01 | 2014-11-27 | 2019-01-04 | 0 | 0 | 0 | 0 | 0 | ... | No | 9 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | 0 | 0 | 0 | 1 | 0 | ... | No | 9 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
139146 rows × 85 columns
Identity Features to ABT¶
training_window.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 139146 entries, 0 to 139145 Data columns (total 85 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 client_id 139146 non-null int32 1 Target 139146 non-null float64 2 Month 139146 non-null object 3 First_product_dt 139146 non-null object 4 Last_product_dt 139146 non-null object 5 CreditCard_Premium 139146 non-null int32 6 CreditCard_Active 139146 non-null int32 7 CreditCard_CoBranding 139146 non-null int32 8 Loan_Active 139146 non-null int32 9 Mortgage_Active 139146 non-null int32 10 SavingAccount_Active_ARG_Salary 139146 non-null int32 11 SavingAccount_Active_ARG 139146 non-null int32 12 SavingAccount_Active_DOLLAR 139146 non-null int32 13 DebitCard_Active 139146 non-null int32 14 Investment_Active 139146 non-null int32 15 Package_Active 139146 non-null int32 16 Insurance_Life 139146 non-null int32 17 Insurance_Home 139146 non-null int32 18 Insurance_Accidents 139146 non-null int32 19 Insurance_Mobile 139146 non-null int32 20 Insurance_ATM 139146 non-null int32 21 Insurance_Unemployment 139146 non-null int32 22 Sex 139146 non-null int32 23 Client_Age_grp 139146 non-null int64 24 SavingAccount_Balance_FirstDate 139146 non-null float64 25 SavingAccount_Balance_LastDate 139146 non-null float64 26 SavingAccount_Balance_Average 139146 non-null float64 27 SavingAccount_Days_with_use 139146 non-null float64 28 SavingAccount_Days_with_Credits 139146 non-null float64 29 SavingAccount_Days_with_Debits 139146 non-null float64 30 SavingAccount_Salary_Payment_Transactions 139146 non-null float64 31 SavingAccount_Transfer_In_Transactions 139146 non-null float64 32 SavingAccount_ATM_Extraction_Transactions 139146 non-null float64 33 SavingAccount_Service_Payment_Transactions 139146 non-null float64 34 SavingAccount_CreditCard_Payment_Transactions 139146 non-null float64 35 SavingAccount_Transfer_Out_Transactions 139146 non-null float64 36 SavingAccount_DebitCard_Spend_Transactions 139146 non-null float64 37 SavingAccount_Transactions_Transactions 139146 non-null float64 38 SavingAccount_Credits_Transactions 139146 non-null float64 39 SavingAccount_Debits_Transactions 139146 non-null float64 40 SavingAccount_Salary_Payment_Amount 139146 non-null float64 41 SavingAccount_Transfer_In_Amount 139146 non-null float64 42 SavingAccount_ATM_Extraction_Amount 139146 non-null float64 43 SavingAccount_Service_Payment_Amount 139146 non-null float64 44 SavingAccount_CreditCard_Payment_Amount 139146 non-null float64 45 SavingAccount_Transfer_Out_Amount 139146 non-null float64 46 SavingAccount_DebitCard_Spend_Amount 139146 non-null float64 47 SavingAccount_Total_Amount 139146 non-null float64 48 SavingAccount_Credits_Amounts 139146 non-null float64 49 SavingAccount_Debits_Amounts 139146 non-null float64 50 Operations_Bank 139146 non-null float64 51 Operations_Terminal 139146 non-null float64 52 Operations_HomeBanking 139146 non-null float64 53 Operations_Mobile 139146 non-null float64 54 Operations_Ivr 139146 non-null float64 55 Operations_Telemarketer 139146 non-null float64 56 Operations_ATM 139146 non-null float64 57 CreditCard_Balance_ARG 139146 non-null float64 58 CreditCard_Balance_DOLLAR 139146 non-null float64 59 CreditCard_Total_Limit 139146 non-null float64 60 CreditCard_Total_Spending 139146 non-null float64 61 CreditCard_Spending_1_Installment 139146 non-null float64 62 CreditCard_Spending_Installments 139146 non-null float64 63 CreditCard_Spending_CrossBoarder 139146 non-null float64 64 CreditCard_Spending_Aut_Debits 139146 non-null float64 65 CreditCard_Revolving 139146 non-null float64 66 CreditCard_Payment_Aut_Debit 139146 non-null float64 67 CreditCard_Payment_External 139146 non-null float64 68 CreditCard_Payment_Cash 139146 non-null float64 69 CreditCard_Payment_Web 139146 non-null float64 70 CreditCard_Payment_ATM 139146 non-null float64 71 CreditCard_Payment_TAS 139146 non-null float64 72 Investment_Numbers 139146 non-null float64 73 Mobile 139146 non-null int32 74 Email 139146 non-null int32 75 CreditCard_Product 139146 non-null object 76 cantidad_meses 139146 non-null int64 77 TGT 139146 non-null float64 78 Region_AMBA Resto 139146 non-null int32 79 Region_BUENOS AIRES 139146 non-null int32 80 Region_CABA Centro/Norte 139146 non-null int32 81 Region_REGION CENTRO 139146 non-null int32 82 Region_REGION CUYO 139146 non-null int32 83 Region_REGION NORTE GRANDE ARGENTINO 139146 non-null int32 84 Region_REGION PATAGONICA 139146 non-null int32 dtypes: float64(51), int32(28), int64(2), object(4) memory usage: 75.4+ MB
identity_features = training_window[training_window.Month == '2019-01-01'][['client_id',
'Target',
'Month',
'First_product_dt',
'Last_product_dt',
'CreditCard_Premium',
'CreditCard_Active',
'Loan_Active',
'Mortgage_Active',
'DebitCard_Active',
'Investment_Active',
'Sex',
'Client_Age_grp',
'Mobile',
'Email',
'CreditCard_Product',
'Region_AMBA Resto',
'Region_BUENOS AIRES',
'Region_CABA Centro/Norte',
'Region_REGION CENTRO',
'Region_REGION CUYO',
'Region_REGION NORTE GRANDE ARGENTINO',
'Region_REGION PATAGONICA',
'SavingAccount_Active_ARG_Salary',
'SavingAccount_Active_ARG',
'SavingAccount_Active_DOLLAR',
'SavingAccount_Days_with_Credits',
'SavingAccount_Days_with_Debits',
'SavingAccount_Salary_Payment_Transactions',
'SavingAccount_Transfer_In_Transactions',
'SavingAccount_ATM_Extraction_Transactions',
'SavingAccount_CreditCard_Payment_Transactions',
'SavingAccount_Transfer_Out_Transactions',
'SavingAccount_DebitCard_Spend_Transactions',
'SavingAccount_Transactions_Transactions',
'SavingAccount_Credits_Transactions',
'SavingAccount_Debits_Transactions',
'SavingAccount_Salary_Payment_Amount',
'SavingAccount_Transfer_In_Amount',
'SavingAccount_ATM_Extraction_Amount',
'SavingAccount_CreditCard_Payment_Amount',
'SavingAccount_Transfer_Out_Amount',
'SavingAccount_DebitCard_Spend_Amount',
'SavingAccount_Total_Amount',
'SavingAccount_Credits_Amounts',
'SavingAccount_Debits_Amounts',
'Operations_HomeBanking',
'Operations_Mobile',
'CreditCard_Balance_ARG',
'CreditCard_Balance_DOLLAR',
'CreditCard_Total_Limit',
'CreditCard_Total_Spending',
'CreditCard_Spending_1_Installment',
'CreditCard_Spending_CrossBoarder',
'CreditCard_Spending_Aut_Debits',
'CreditCard_Revolving',
'TGT']].copy()
identity_features
| client_id | Target | Month | First_product_dt | Last_product_dt | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | ... | Operations_Mobile | CreditCard_Balance_ARG | CreditCard_Balance_DOLLAR | CreditCard_Total_Limit | CreditCard_Total_Spending | CreditCard_Spending_1_Installment | CreditCard_Spending_CrossBoarder | CreditCard_Spending_Aut_Debits | CreditCard_Revolving | TGT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 5928737 | 0.0 | 2019-01-01 | 2016-08-31 | 2018-12-27 | 0 | 0 | 0 | 0 | 1 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 8 | 6018047 | 1.0 | 2019-01-01 | 2014-04-29 | 2017-05-31 | 1 | 1 | 0 | 0 | 1 | ... | 0.0 | 16397.20 | 0.00 | 80000.0 | 16068.08 | 1000.00 | 0.00 | 12510.9125 | -14.31 | 1.0 |
| 9 | 5359038 | 1.0 | 2019-01-01 | 2016-01-07 | 2017-05-24 | 1 | 1 | 1 | 0 | 1 | ... | 0.0 | 6906.73 | 33.67 | 64000.0 | 6919.64 | 1840.00 | 33.57 | 299.0000 | 0.00 | 1.0 |
| 11 | 6890812 | 0.0 | 2019-01-01 | 2017-06-09 | 2018-08-27 | 0 | 0 | 1 | 0 | 1 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 13 | 115383 | 0.0 | 2019-01-01 | 2004-07-30 | 2004-07-30 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139131 | 6570413 | 0.0 | 2019-01-01 | 2015-12-04 | 2018-11-02 | 0 | 1 | 0 | 0 | 1 | ... | 1.0 | 537.43 | 0.00 | 28000.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 139132 | 6258895 | 0.0 | 2019-01-01 | 2015-03-12 | 2018-03-02 | 0 | 0 | 0 | 0 | 1 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 139138 | 6397274 | 0.0 | 2019-01-01 | 2015-06-23 | 2015-06-23 | 0 | 1 | 0 | 0 | 0 | ... | 0.0 | 1475.18 | 0.00 | 40000.0 | 1384.27 | 0.00 | 0.00 | 459.0000 | -77.32 | 0.0 |
| 139139 | 6007291 | 0.0 | 2019-01-01 | 2014-04-13 | 2014-04-13 | 1 | 1 | 0 | 0 | 0 | ... | 0.0 | 33148.43 | 0.00 | 96000.0 | 32977.86 | 11385.99 | 0.00 | 12510.9125 | 0.00 | 0.0 |
| 139145 | 6412619 | 0.0 | 2019-01-01 | 2015-07-08 | 2018-06-02 | 0 | 0 | 1 | 0 | 1 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
23191 rows × 57 columns
Tratamiento de variables con fechas¶
dateColumns = ['Month','First_product_dt','Last_product_dt']
identity_features.drop(columns=dateColumns, inplace=True)
identity_features
| client_id | Target | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | ... | Operations_Mobile | CreditCard_Balance_ARG | CreditCard_Balance_DOLLAR | CreditCard_Total_Limit | CreditCard_Total_Spending | CreditCard_Spending_1_Installment | CreditCard_Spending_CrossBoarder | CreditCard_Spending_Aut_Debits | CreditCard_Revolving | TGT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 5928737 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 30 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 8 | 6018047 | 1.0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 60 | ... | 0.0 | 16397.20 | 0.00 | 80000.0 | 16068.08 | 1000.00 | 0.00 | 12510.9125 | -14.31 | 1.0 |
| 9 | 5359038 | 1.0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 40 | ... | 0.0 | 6906.73 | 33.67 | 64000.0 | 6919.64 | 1840.00 | 33.57 | 299.0000 | 0.00 | 1.0 |
| 11 | 6890812 | 0.0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 40 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 13 | 115383 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 70 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 139131 | 6570413 | 0.0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 30 | ... | 1.0 | 537.43 | 0.00 | 28000.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 139132 | 6258895 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 30 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
| 139138 | 6397274 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 40 | ... | 0.0 | 1475.18 | 0.00 | 40000.0 | 1384.27 | 0.00 | 0.00 | 459.0000 | -77.32 | 0.0 |
| 139139 | 6007291 | 0.0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 40 | ... | 0.0 | 33148.43 | 0.00 | 96000.0 | 32977.86 | 11385.99 | 0.00 | 12510.9125 | 0.00 | 0.0 |
| 139145 | 6412619 | 0.0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 60 | ... | 0.0 | 0.00 | 0.00 | 0.0 | 0.00 | 0.00 | 0.00 | 0.0000 | 0.00 | 0.0 |
23191 rows × 54 columns
Transformar varible CreditCard_Product¶
identity_features.CreditCard_Product.value_counts()
CreditCard_Product No 8426 J55660104XX012 8262 J55660202XX012 5759 J55660102XX012 402 J55660702XX012 249 J55661002XX012 62 J55660124XX012 30 J55660123XX012 1 Name: count, dtype: int64
CreditCard_Product_Transform = identity_features[['client_id','CreditCard_Product','TGT']].groupby('CreditCard_Product').agg(['sum',np.count_nonzero])
CreditCard_Product_Transform.columns = ['_'.join(x) for x in np.array(CreditCard_Product_Transform.columns)]
CreditCard_Product_Transform['Porcent_TGT'] = CreditCard_Product_Transform['TGT_sum']/CreditCard_Product_Transform['client_id_count_nonzero']
CreditCard_Product_Transform
| client_id_sum | client_id_count_nonzero | TGT_sum | TGT_count_nonzero | Porcent_TGT | |
|---|---|---|---|---|---|
| CreditCard_Product | |||||
| J55660102XX012 | 1062656436 | 402 | 138.0 | 138 | 0.343284 |
| J55660104XX012 | 34506107357 | 8262 | 3880.0 | 3880 | 0.469620 |
| J55660123XX012 | 994616 | 1 | 0.0 | 0 | 0.000000 |
| J55660124XX012 | 107695565 | 30 | 11.0 | 11 | 0.366667 |
| J55660202XX012 | 23762080087 | 5759 | 2128.0 | 2128 | 0.369509 |
| J55660702XX012 | 1060543322 | 249 | 108.0 | 108 | 0.433735 |
| J55661002XX012 | 204312610 | 62 | 40.0 | 40 | 0.645161 |
| No | 40418495289 | 8426 | 518.0 | 518 | 0.061476 |
di = {
"No" : 6,
"J55660102XX012" : 35,
"J55660104XX012" : 48,
"J55660123XX012" : 0,
"J55660124XX012" : 35,
"J55660202XX012" : 35,
"J55660702XX012" : 48,
"J55661002XX012" : 48,
}
identity_features.CreditCard_Product = identity_features.CreditCard_Product.map(di)
identity_features.CreditCard_Product.value_counts()
CreditCard_Product 48 8573 6 8426 35 6191 0 1 Name: count, dtype: int64
Aggregate Features to ABT¶
columns=[x for x in training_window if(x in ([
'SavingAccount_Days_with_Credits',
'SavingAccount_Days_with_Debits',
'SavingAccount_Salary_Payment_Transactions',
'SavingAccount_Transfer_In_Transactions',
'SavingAccount_ATM_Extraction_Transactions',
'SavingAccount_CreditCard_Payment_Transactions',
'SavingAccount_Transfer_Out_Transactions',
'SavingAccount_DebitCard_Spend_Transactions',
'SavingAccount_Transactions_Transactions',
'SavingAccount_Credits_Transactions',
'SavingAccount_Debits_Transactions',
'SavingAccount_Salary_Payment_Amount',
'SavingAccount_Transfer_In_Amount',
'SavingAccount_ATM_Extraction_Amount',
'SavingAccount_CreditCard_Payment_Amount',
'SavingAccount_Transfer_Out_Amount',
'SavingAccount_DebitCard_Spend_Amount',
'SavingAccount_Total_Amount',
'SavingAccount_Credits_Amounts',
'SavingAccount_Debits_Amounts',
'Operations_HomeBanking',
'Operations_Mobile',
'CreditCard_Balance_ARG',
'CreditCard_Balance_DOLLAR',
'CreditCard_Total_Limit',
'CreditCard_Total_Spending',
'CreditCard_Spending_1_Installment',
'CreditCard_Spending_CrossBoarder',
'CreditCard_Spending_Aut_Debits',
'CreditCard_Revolving']))]
len(columns)
30
aggregateFeatures = training_window.groupby('client_id')[columns].agg(['sum','max','min', 'mean','nunique', np.count_nonzero,'var']).reset_index()
aggregateFeatures.columns = ['_'.join(x) for x in np.array(aggregateFeatures.columns)]
aggregateFeatures.rename(columns={'client_id_':'client_id'}, inplace=True)
aggregateFeatures
| client_id | SavingAccount_Days_with_Credits_sum | SavingAccount_Days_with_Credits_max | SavingAccount_Days_with_Credits_min | SavingAccount_Days_with_Credits_mean | SavingAccount_Days_with_Credits_nunique | SavingAccount_Days_with_Credits_count_nonzero | SavingAccount_Days_with_Credits_var | SavingAccount_Days_with_Debits_sum | SavingAccount_Days_with_Debits_max | ... | CreditCard_Spending_Aut_Debits_nunique | CreditCard_Spending_Aut_Debits_count_nonzero | CreditCard_Spending_Aut_Debits_var | CreditCard_Revolving_sum | CreditCard_Revolving_max | CreditCard_Revolving_min | CreditCard_Revolving_mean | CreditCard_Revolving_nunique | CreditCard_Revolving_count_nonzero | CreditCard_Revolving_var | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1030 | 13.0 | 3.0 | 1.0 | 2.166667 | 3 | 6 | 0.566667 | 17.000000 | 4.000000 | ... | 2 | 6 | 337.500000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
| 1 | 1094 | 0.0 | 0.0 | 0.0 | 0.000000 | 1 | 0 | 0.000000 | 0.000000 | 0.000000 | ... | 1 | 0 | 0.000000 | 7824.34 | 4121.95 | -86.97 | 1304.056667 | 6 | 6 | 2.834001e+06 |
| 2 | 1553 | 0.0 | 0.0 | 0.0 | 0.000000 | 1 | 0 | 0.000000 | 0.000000 | 0.000000 | ... | 1 | 0 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
| 3 | 1590 | 0.0 | 0.0 | 0.0 | 0.000000 | 1 | 0 | 0.000000 | 0.000000 | 0.000000 | ... | 3 | 6 | 1414.566667 | 2874.30 | 2486.88 | -45.10 | 479.050000 | 5 | 4 | 9.991904e+05 |
| 4 | 1948 | 8.0 | 2.0 | 1.0 | 1.333333 | 2 | 6 | 0.266667 | 11.000000 | 2.000000 | ... | 6 | 6 | 187224.700000 | 20378.64 | 9969.35 | -0.66 | 3396.440000 | 5 | 4 | 1.874074e+07 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23186 | 7344944 | 7.0 | 5.0 | 0.0 | 1.166667 | 3 | 2 | 4.166667 | 8.000000 | 6.000000 | ... | 1 | 0 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
| 23187 | 7345011 | 8.0 | 5.0 | 0.0 | 1.333333 | 3 | 2 | 4.666667 | 9.000000 | 5.000000 | ... | 1 | 0 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
| 23188 | 7345026 | 15.0 | 5.0 | 0.0 | 2.500000 | 2 | 3 | 7.500000 | 12.000000 | 7.000000 | ... | 1 | 0 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
| 23189 | 7345029 | 6.0 | 3.0 | 0.0 | 1.000000 | 2 | 2 | 2.400000 | 10.000000 | 6.000000 | ... | 1 | 0 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
| 23190 | 7345492 | 10.0 | 5.0 | 0.0 | 1.666667 | 4 | 3 | 4.266667 | 18.723007 | 9.723007 | ... | 1 | 0 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000000 | 1 | 0 | 0.000000e+00 |
23191 rows × 211 columns
Operations = training_window[['client_id','Operations_Bank','Operations_Terminal','Operations_HomeBanking',
'Operations_Mobile','Operations_Ivr','Operations_Telemarketer','Operations_ATM']]
Operations['TotalOperations'] = Operations['Operations_Bank'] + Operations['Operations_Terminal'] \
+ Operations['Operations_HomeBanking'] + Operations['Operations_Mobile'] \
+ Operations['Operations_Ivr'] + Operations['Operations_Telemarketer'] \
+ Operations['Operations_ATM']
Operations_Aggregate = Operations.groupby('client_id').agg('sum').reset_index()
Operations_Aggregate['Porcent_Operations_HomeBanking'] = np.where(Operations_Aggregate['TotalOperations'] == 0,0,Operations_Aggregate['Operations_HomeBanking']/Operations_Aggregate['TotalOperations'])
Operations_Aggregate['Porcent_Operations_Mobile'] = np.where(Operations_Aggregate['TotalOperations'] == 0,0,Operations_Aggregate['Operations_Mobile']/Operations_Aggregate['TotalOperations'])
Operations_Aggregate
C:\Users\tutem\AppData\Local\Temp\ipykernel_22168\2919754844.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Operations['TotalOperations'] = Operations['Operations_Bank'] + Operations['Operations_Terminal'] \
| client_id | Operations_Bank | Operations_Terminal | Operations_HomeBanking | Operations_Mobile | Operations_Ivr | Operations_Telemarketer | Operations_ATM | TotalOperations | Porcent_Operations_HomeBanking | Porcent_Operations_Mobile | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1030 | 0.0 | 0.0 | 6.0 | 1.0 | 0.0 | 1.0 | 0.0 | 8.0 | 0.750000 | 0.125000 |
| 1 | 1094 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.000000 | 0.000000 |
| 2 | 1553 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.000000 | 0.000000 |
| 3 | 1590 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2.0 | 0.000000 | 0.000000 |
| 4 | 1948 | 0.0 | 0.0 | 8.0 | 1.0 | 0.0 | 0.0 | 0.0 | 9.0 | 0.888889 | 0.111111 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23186 | 7344944 | 0.0 | 1.0 | 3.0 | 30.0 | 0.0 | 0.0 | 8.0 | 42.0 | 0.071429 | 0.714286 |
| 23187 | 7345011 | 2.0 | 1.0 | 1.0 | 3.0 | 0.0 | 0.0 | 7.0 | 14.0 | 0.071429 | 0.214286 |
| 23188 | 7345026 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 0.000000 | 0.000000 |
| 23189 | 7345029 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 9.0 | 0.000000 | 0.000000 |
| 23190 | 7345492 | 2.0 | 0.0 | 11.0 | 22.0 | 0.0 | 0.0 | 0.0 | 35.0 | 0.314286 | 0.628571 |
23191 rows × 11 columns
Insurances = training_window[['client_id','Insurance_Life','Insurance_Home','Insurance_Accidents',
'Insurance_Mobile','Insurance_ATM','Insurance_Unemployment']]
Insurances['TotalInsurances'] = Insurances['Insurance_Life'] + Insurances['Insurance_Home'] \
+ Insurances['Insurance_Accidents'] + Insurances['Insurance_Mobile'] \
+ Insurances['Insurance_ATM'] + Insurances['Insurance_Unemployment']
Insurances_Aggregate = Insurances.groupby('client_id').agg('sum').reset_index()
Insurances_Aggregate['Porcent_Total_Insurances'] = Insurances_Aggregate['TotalInsurances'] / 5
#Insurances_Aggregate = Insurances_Aggregate[['client_id','TotalInsurances','Porcent_Total_Insurances']]
Insurances_Aggregate
C:\Users\tutem\AppData\Local\Temp\ipykernel_22168\4288838385.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Insurances['TotalInsurances'] = Insurances['Insurance_Life'] + Insurances['Insurance_Home'] \
| client_id | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | Porcent_Total_Insurances | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1030 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0.2 |
| 1 | 1094 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 2 | 1553 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 3 | 1590 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.4 |
| 4 | 1948 | 0 | 1 | 0 | 0 | 1 | 0 | 2 | 0.4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23186 | 7344944 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 23187 | 7345011 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0.2 |
| 23188 | 7345026 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 23189 | 7345029 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 23190 | 7345492 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
23191 rows × 9 columns
Create ABT¶
ABT = identity_features.merge(aggregateFeatures, how='inner', on='client_id')\
.merge(Operations_Aggregate, how='inner', on='client_id')\
.merge(Insurances_Aggregate, how='inner', on='client_id')
ABT.drop(columns=['Target'], inplace=True)
ABT
| client_id | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | Mobile | ... | Porcent_Operations_HomeBanking | Porcent_Operations_Mobile | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | Porcent_Total_Insurances | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5928737 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 30 | 1 | ... | 0.000000 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 1 | 6018047 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 60 | 1 | ... | 0.727273 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 2 | 5359038 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 40 | 1 | ... | 0.915663 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 3 | 6890812 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 40 | 1 | ... | 0.972603 | 0.0 | 1 | 0 | 0 | 0 | 0 | 1 | 2 | 0.4 |
| 4 | 115383 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 70 | 0 | ... | 0.000000 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23186 | 6570413 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 30 | 1 | ... | 0.300000 | 0.2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 23187 | 6258895 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 30 | 1 | ... | 0.000000 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0.2 |
| 23188 | 6397274 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 40 | 0 | ... | 0.000000 | 0.0 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0.4 |
| 23189 | 6007291 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 40 | 0 | ... | 0.000000 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 |
| 23190 | 6412619 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 60 | 1 | ... | 0.000000 | 0.0 | 1 | 1 | 0 | 0 | 1 | 0 | 3 | 0.6 |
23191 rows × 281 columns
Dimensionality Reduction¶
Minimum = Maximum¶
ABT.describe()
| client_id | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | Mobile | ... | Porcent_Operations_HomeBanking | Porcent_Operations_Mobile | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | Porcent_Total_Insurances | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.319100e+04 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | ... | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 | 23191.000000 |
| mean | 4.360437e+06 | 0.115346 | 0.647794 | 0.127981 | 0.022379 | 0.669527 | 0.054763 | 0.573541 | 44.405459 | 0.881075 | ... | 0.283736 | 0.078601 | 0.193437 | 0.145272 | 0.156268 | 0.076409 | 0.143978 | 0.077918 | 0.793282 | 0.158656 |
| std | 2.144133e+06 | 0.319446 | 0.477668 | 0.334075 | 0.147917 | 0.470394 | 0.227521 | 0.494573 | 13.453333 | 0.323708 | ... | 0.380932 | 0.213620 | 0.395001 | 0.352382 | 0.363117 | 0.265657 | 0.351075 | 0.268049 | 1.174822 | 0.234964 |
| min | 1.030000e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 18.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 2.525100e+06 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 30.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 4.991596e+06 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 40.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 6.154208e+06 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 50.000000 | 1.000000 | ... | 0.619048 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.200000 |
| max | 7.345492e+06 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 70.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 6.000000 | 1.200000 |
8 rows × 281 columns
x = pd.DataFrame(ABT.describe().T)
x.columns
delete = x[x['min']==x['max']].reset_index()
print('ABT min/max: ', ABT.shape)
ABT.drop(delete['index'], axis=1, inplace=True)
print('ABT: ', ABT.shape)
ABT min/max: (23191, 281) ABT: (23191, 281)
PCA¶
#to scale the data using z-score
from sklearn.preprocessing import StandardScaler
cols = [x for x in ABT.columns if (x.startswith('CreditCard_Spending')) ]
print(cols)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(ABT[cols])
type(data_scaled)
['CreditCard_Spending_1_Installment', 'CreditCard_Spending_CrossBoarder', 'CreditCard_Spending_Aut_Debits', 'CreditCard_Spending_1_Installment_sum', 'CreditCard_Spending_1_Installment_max', 'CreditCard_Spending_1_Installment_min', 'CreditCard_Spending_1_Installment_mean', 'CreditCard_Spending_1_Installment_nunique', 'CreditCard_Spending_1_Installment_count_nonzero', 'CreditCard_Spending_1_Installment_var', 'CreditCard_Spending_CrossBoarder_sum', 'CreditCard_Spending_CrossBoarder_max', 'CreditCard_Spending_CrossBoarder_min', 'CreditCard_Spending_CrossBoarder_mean', 'CreditCard_Spending_CrossBoarder_nunique', 'CreditCard_Spending_CrossBoarder_count_nonzero', 'CreditCard_Spending_CrossBoarder_var', 'CreditCard_Spending_Aut_Debits_sum', 'CreditCard_Spending_Aut_Debits_max', 'CreditCard_Spending_Aut_Debits_min', 'CreditCard_Spending_Aut_Debits_mean', 'CreditCard_Spending_Aut_Debits_nunique', 'CreditCard_Spending_Aut_Debits_count_nonzero', 'CreditCard_Spending_Aut_Debits_var']
numpy.ndarray
data_scaled = pd.DataFrame(data_scaled, columns=ABT[cols].columns)
data_scaled.head(5)
| CreditCard_Spending_1_Installment | CreditCard_Spending_CrossBoarder | CreditCard_Spending_Aut_Debits | CreditCard_Spending_1_Installment_sum | CreditCard_Spending_1_Installment_max | CreditCard_Spending_1_Installment_min | CreditCard_Spending_1_Installment_mean | CreditCard_Spending_1_Installment_nunique | CreditCard_Spending_1_Installment_count_nonzero | CreditCard_Spending_1_Installment_var | ... | CreditCard_Spending_CrossBoarder_nunique | CreditCard_Spending_CrossBoarder_count_nonzero | CreditCard_Spending_CrossBoarder_var | CreditCard_Spending_Aut_Debits_sum | CreditCard_Spending_Aut_Debits_max | CreditCard_Spending_Aut_Debits_min | CreditCard_Spending_Aut_Debits_mean | CreditCard_Spending_Aut_Debits_nunique | CreditCard_Spending_Aut_Debits_count_nonzero | CreditCard_Spending_Aut_Debits_var | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.404539 | -0.174669 | -0.473121 | -0.482228 | -0.595466 | -0.206982 | -0.482228 | -0.857834 | -0.857969 | -0.369477 | ... | -0.325674 | -0.308900 | -0.248440 | -0.483256 | -0.509510 | -0.427386 | -0.483256 | -0.717120 | -0.909830 | -0.184095 |
| 1 | -0.121738 | -0.174669 | 4.916441 | 1.349612 | 2.198174 | -0.206982 | 1.349612 | 1.619630 | 1.156892 | 1.813086 | ... | -0.325674 | -0.308900 | -0.248440 | 5.305475 | 4.312784 | 6.166697 | 5.305475 | -0.181371 | 1.155444 | -0.131284 |
| 2 | 0.115814 | 1.574087 | -0.344315 | -0.178073 | -0.230906 | -0.206982 | -0.178073 | 1.619630 | 1.156892 | -0.333223 | ... | 1.156702 | 0.664959 | -0.023213 | -0.366725 | -0.394261 | -0.427386 | -0.366725 | -0.181371 | 0.811232 | -0.176727 |
| 3 | -0.404539 | -0.174669 | -0.473121 | -0.482228 | -0.595466 | -0.206982 | -0.482228 | -0.857834 | -0.857969 | -0.369477 | ... | -0.325674 | -0.308900 | -0.248440 | -0.483256 | -0.509510 | -0.427386 | -0.483256 | -0.717120 | -0.909830 | -0.184095 |
| 4 | -0.404539 | -0.174669 | -0.473121 | -0.482228 | -0.595466 | -0.206982 | -0.482228 | -0.857834 | -0.857969 | -0.369477 | ... | -0.325674 | -0.308900 | -0.248440 | -0.483256 | -0.509510 | -0.427386 | -0.483256 | -0.717120 | -0.909830 | -0.184095 |
5 rows × 24 columns
data_scaled.describe()
| CreditCard_Spending_1_Installment | CreditCard_Spending_CrossBoarder | CreditCard_Spending_Aut_Debits | CreditCard_Spending_1_Installment_sum | CreditCard_Spending_1_Installment_max | CreditCard_Spending_1_Installment_min | CreditCard_Spending_1_Installment_mean | CreditCard_Spending_1_Installment_nunique | CreditCard_Spending_1_Installment_count_nonzero | CreditCard_Spending_1_Installment_var | ... | CreditCard_Spending_CrossBoarder_nunique | CreditCard_Spending_CrossBoarder_count_nonzero | CreditCard_Spending_CrossBoarder_var | CreditCard_Spending_Aut_Debits_sum | CreditCard_Spending_Aut_Debits_max | CreditCard_Spending_Aut_Debits_min | CreditCard_Spending_Aut_Debits_mean | CreditCard_Spending_Aut_Debits_nunique | CreditCard_Spending_Aut_Debits_count_nonzero | CreditCard_Spending_Aut_Debits_var | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | ... | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 | 2.319100e+04 |
| mean | 5.361777e-18 | -5.943913e-17 | 5.469013e-17 | 5.208584e-17 | 2.083433e-17 | 2.451098e-17 | 4.166867e-17 | -6.556688e-17 | 1.715769e-17 | -1.945559e-17 | ... | -1.862835e-16 | 4.657087e-17 | -6.403494e-17 | 4.274103e-17 | 1.363423e-17 | -8.885231e-18 | 1.838324e-17 | -1.286827e-17 | -2.849402e-17 | 2.757485e-18 |
| std | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | ... | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 | 1.000022e+00 |
| min | -5.970550e+00 | -1.746695e-01 | -4.555431e+00 | -7.962067e+00 | -4.495013e+00 | -1.319936e+01 | -7.962067e+00 | -8.578344e-01 | -8.579690e-01 | -3.694769e-01 | ... | -3.256741e-01 | -3.089002e-01 | -2.484396e-01 | -1.050400e+00 | -5.095100e-01 | -5.763493e+00 | -1.050400e+00 | -7.171202e-01 | -9.098305e-01 | -1.840948e-01 |
| 25% | -4.045387e-01 | -1.746695e-01 | -4.731206e-01 | -4.822276e-01 | -5.954658e-01 | -2.069816e-01 | -4.822276e-01 | -8.578344e-01 | -8.579690e-01 | -3.694769e-01 | ... | -3.256741e-01 | -3.089002e-01 | -2.484396e-01 | -4.832563e-01 | -5.095100e-01 | -4.273857e-01 | -4.832563e-01 | -7.171202e-01 | -9.098305e-01 | -1.840948e-01 |
| 50% | -4.045387e-01 | -1.746695e-01 | -4.731206e-01 | -4.631958e-01 | -5.520753e-01 | -2.069816e-01 | -4.631958e-01 | -3.623415e-01 | -4.549967e-01 | -3.693950e-01 | ... | -3.256741e-01 | -3.089002e-01 | -2.484396e-01 | -4.832563e-01 | -5.095100e-01 | -4.273857e-01 | -4.832563e-01 | -7.171202e-01 | -9.098305e-01 | -1.840948e-01 |
| 75% | -1.798650e-01 | -1.746695e-01 | -4.200653e-02 | 1.850608e-02 | 1.678342e-01 | -2.069816e-01 | 1.850608e-02 | 1.124137e+00 | 1.156892e+00 | -2.023504e-01 | ... | -3.256741e-01 | -3.089002e-01 | -2.484396e-01 | -7.949366e-03 | 3.423440e-02 | -8.840142e-02 | -7.949366e-03 | 8.901281e-01 | 1.155444e+00 | -1.696582e-01 |
| max | 5.161473e+00 | 7.306930e+00 | 4.916441e+00 | 6.997612e+00 | 3.304082e+00 | 1.278540e+01 | 6.997612e+00 | 1.619630e+00 | 1.559865e+00 | 1.394463e+01 | ... | 7.086209e+00 | 5.534253e+00 | 7.171873e+00 | 5.367874e+00 | 4.312784e+00 | 6.617469e+00 | 5.367874e+00 | 1.961627e+00 | 1.155444e+00 | 2.346509e+01 |
8 rows × 24 columns
#Importing PCA and TSNE
from sklearn.decomposition import PCA
#Defining the number of principal components to generate
n = data_scaled.shape[1]
#Finding principal components for the data
pca1 = PCA(n_components=n, random_state=1)
data_pca = pd.DataFrame(pca1.fit_transform(data_scaled))
#The percentage of variance explained by each principal component
exp_var1 = pca1.explained_variance_ratio_
pca1.explained_variance_ratio_.cumsum()
array([0.38312432, 0.56622212, 0.6940415 , 0.74982331, 0.79421735,
0.83520993, 0.87372543, 0.90582188, 0.93159319, 0.95309944,
0.96551444, 0.97468403, 0.98155217, 0.98748567, 0.99094997,
0.99363639, 0.99602674, 0.99733536, 0.99841912, 0.99946476,
1. , 1. , 1. , 1. ])
import matplotlib.pyplot as plt
import seaborn as sns
# visulaize the explained variance by individual components
plt.figure(figsize = (10,10))
plt.plot(range(1,n+1), pca1.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--')
plt.title("Explained Variances by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
Text(0, 0.5, 'Cumulative Explained Variance')
# find the least number of components that can explain more than 80% variance
sum = 0
for ix, i in enumerate(exp_var1):
sum = sum + i
if(sum>0.80):
print("Number of PCs that explain at least 80% variance: ", ix+1)
break
Number of PCs that explain at least 80% variance: 6
# find the least number of components that can explain more than 70% variance
sum = 0
for ix, i in enumerate(exp_var1):
sum = sum + i
if(sum>0.70):
print("Number of PCs that explain at least 70% variance: ", ix+1)
break
Number of PCs that explain at least 70% variance: 4
cols = ['PC1', 'PC2', 'PC3', 'PC4']
pc1 = pd.DataFrame(np.round(pca1.components_.T[:, 0:4],2), index=data_scaled.columns, columns=cols)
def color_high(val):
if val <= -0.25: # you can decide any value as per your understanding
return 'background: pink'
elif val >= 0.25:
return 'background: skyblue'
pc1.style.applymap(color_high)
| PC1 | PC2 | PC3 | PC4 | |
|---|---|---|---|---|
| CreditCard_Spending_1_Installment | 0.210000 | 0.000000 | -0.300000 | 0.180000 |
| CreditCard_Spending_CrossBoarder | 0.140000 | 0.240000 | 0.130000 | 0.050000 |
| CreditCard_Spending_Aut_Debits | 0.240000 | -0.250000 | 0.200000 | 0.110000 |
| CreditCard_Spending_1_Installment_sum | 0.250000 | 0.010000 | -0.330000 | 0.180000 |
| CreditCard_Spending_1_Installment_max | 0.250000 | 0.020000 | -0.310000 | -0.000000 |
| CreditCard_Spending_1_Installment_min | 0.160000 | -0.000000 | -0.260000 | 0.380000 |
| CreditCard_Spending_1_Installment_mean | 0.250000 | 0.010000 | -0.330000 | 0.180000 |
| CreditCard_Spending_1_Installment_nunique | 0.220000 | -0.030000 | -0.190000 | -0.470000 |
| CreditCard_Spending_1_Installment_count_nonzero | 0.230000 | -0.040000 | -0.190000 | -0.430000 |
| CreditCard_Spending_1_Installment_var | 0.180000 | 0.040000 | -0.230000 | -0.010000 |
| CreditCard_Spending_CrossBoarder_sum | 0.190000 | 0.330000 | 0.180000 | 0.040000 |
| CreditCard_Spending_CrossBoarder_max | 0.200000 | 0.310000 | 0.160000 | 0.020000 |
| CreditCard_Spending_CrossBoarder_min | 0.060000 | 0.130000 | 0.090000 | 0.020000 |
| CreditCard_Spending_CrossBoarder_mean | 0.190000 | 0.330000 | 0.180000 | 0.040000 |
| CreditCard_Spending_CrossBoarder_nunique | 0.190000 | 0.250000 | 0.110000 | -0.090000 |
| CreditCard_Spending_CrossBoarder_count_nonzero | 0.180000 | 0.240000 | 0.120000 | -0.080000 |
| CreditCard_Spending_CrossBoarder_var | 0.180000 | 0.300000 | 0.150000 | 0.050000 |
| CreditCard_Spending_Aut_Debits_sum | 0.240000 | -0.260000 | 0.210000 | 0.120000 |
| CreditCard_Spending_Aut_Debits_max | 0.240000 | -0.260000 | 0.210000 | 0.100000 |
| CreditCard_Spending_Aut_Debits_min | 0.220000 | -0.240000 | 0.200000 | 0.130000 |
| CreditCard_Spending_Aut_Debits_mean | 0.240000 | -0.260000 | 0.210000 | 0.120000 |
| CreditCard_Spending_Aut_Debits_nunique | 0.210000 | -0.180000 | 0.090000 | -0.300000 |
| CreditCard_Spending_Aut_Debits_count_nonzero | 0.200000 | -0.150000 | 0.050000 | -0.400000 |
| CreditCard_Spending_Aut_Debits_var | 0.090000 | -0.120000 | 0.100000 | 0.130000 |
Correlated Features¶
ABT.corr()
| client_id | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | Mobile | ... | Porcent_Operations_HomeBanking | Porcent_Operations_Mobile | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | Porcent_Total_Insurances | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| client_id | 1.000000 | 0.044398 | -0.148117 | 0.008420 | -0.125800 | 0.149141 | -0.035678 | -0.058477 | -0.302570 | 0.038438 | ... | 0.003432 | 0.086509 | -0.006130 | -0.075573 | -0.030472 | -0.036124 | 0.013505 | 0.012061 | -0.035528 | -0.035528 |
| CreditCard_Premium | 0.044398 | 1.000000 | 0.254667 | -0.007414 | -0.015391 | -0.099862 | -0.035889 | -0.026809 | 0.020644 | 0.003805 | ... | 0.033540 | 0.028793 | 0.029580 | 0.021988 | 0.042374 | 0.079069 | -0.047733 | -0.007771 | 0.031480 | 0.031480 |
| CreditCard_Active | -0.148117 | 0.254667 | 1.000000 | 0.038196 | -0.052002 | -0.285630 | -0.038765 | -0.017215 | 0.172011 | 0.012443 | ... | 0.111196 | 0.078426 | 0.155410 | 0.158473 | 0.174625 | 0.191357 | -0.019796 | 0.013618 | 0.194221 | 0.194221 |
| Loan_Active | 0.008420 | -0.007414 | 0.038196 | 1.000000 | -0.037892 | 0.223872 | -0.040017 | -0.011554 | 0.063915 | 0.065383 | ... | 0.005896 | 0.058987 | 0.270208 | 0.158914 | 0.273786 | 0.067158 | 0.217906 | 0.708234 | 0.465033 | 0.465033 |
| Mortgage_Active | -0.125800 | -0.015391 | -0.052002 | -0.037892 | 1.000000 | 0.002179 | 0.019961 | 0.034384 | 0.002331 | -0.029069 | ... | 0.056854 | 0.005701 | 0.009304 | 0.045174 | -0.003294 | -0.021570 | 0.045069 | -0.035281 | 0.016201 | 0.016201 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Insurance_Mobile | -0.036124 | 0.079069 | 0.191357 | 0.067158 | -0.021570 | -0.058112 | -0.019291 | 0.011712 | 0.132112 | 0.040485 | ... | -0.011414 | 0.007520 | 0.175155 | 0.250395 | 0.230260 | 1.000000 | 0.021671 | 0.060514 | 0.451573 | 0.451573 |
| Insurance_ATM | 0.013505 | -0.047733 | -0.019796 | 0.217906 | 0.045069 | 0.243219 | 0.037870 | 0.012652 | -0.054322 | 0.058469 | ... | 0.128352 | 0.097599 | 0.233253 | 0.149513 | 0.146543 | 0.021671 | 1.000000 | 0.176342 | 0.512532 | 0.512532 |
| Insurance_Unemployment | 0.012061 | -0.007771 | 0.013618 | 0.708234 | -0.035281 | 0.168320 | -0.038151 | -0.000777 | -0.023123 | 0.051635 | ... | 0.007100 | 0.051294 | 0.271025 | 0.107511 | 0.298883 | 0.060514 | 0.176342 | 1.000000 | 0.510293 | 0.510293 |
| TotalInsurances | -0.035528 | 0.031480 | 0.194221 | 0.465033 | 0.016201 | 0.124437 | -0.032341 | 0.032548 | 0.020222 | 0.085595 | ... | 0.088078 | 0.091103 | 0.710251 | 0.589711 | 0.687889 | 0.451573 | 0.512532 | 0.510293 | 1.000000 | 1.000000 |
| Porcent_Total_Insurances | -0.035528 | 0.031480 | 0.194221 | 0.465033 | 0.016201 | 0.124437 | -0.032341 | 0.032548 | 0.020222 | 0.085595 | ... | 0.088078 | 0.091103 | 0.710251 | 0.589711 | 0.687889 | 0.451573 | 0.512532 | 0.510293 | 1.000000 | 1.000000 |
281 rows × 281 columns
# if Inplace is False, return a copy, default drop(inplace=False)
corr_Matrix = ABT.drop(['client_id','TGT'], axis=1).corr().abs()
corr_Matrix
| CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | Mobile | ... | Porcent_Operations_HomeBanking | Porcent_Operations_Mobile | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | Porcent_Total_Insurances | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CreditCard_Premium | 1.000000 | 0.254667 | 0.007414 | 0.015391 | 0.099862 | 0.035889 | 0.026809 | 0.020644 | 0.003805 | 0.088081 | ... | 0.033540 | 0.028793 | 0.029580 | 0.021988 | 0.042374 | 0.079069 | 0.047733 | 0.007771 | 0.031480 | 0.031480 |
| CreditCard_Active | 0.254667 | 1.000000 | 0.038196 | 0.052002 | 0.285630 | 0.038765 | 0.017215 | 0.172011 | 0.012443 | 0.089459 | ... | 0.111196 | 0.078426 | 0.155410 | 0.158473 | 0.174625 | 0.191357 | 0.019796 | 0.013618 | 0.194221 | 0.194221 |
| Loan_Active | 0.007414 | 0.038196 | 1.000000 | 0.037892 | 0.223872 | 0.040017 | 0.011554 | 0.063915 | 0.065383 | 0.012032 | ... | 0.005896 | 0.058987 | 0.270208 | 0.158914 | 0.273786 | 0.067158 | 0.217906 | 0.708234 | 0.465033 | 0.465033 |
| Mortgage_Active | 0.015391 | 0.052002 | 0.037892 | 1.000000 | 0.002179 | 0.019961 | 0.034384 | 0.002331 | 0.029069 | 0.007549 | ... | 0.056854 | 0.005701 | 0.009304 | 0.045174 | 0.003294 | 0.021570 | 0.045069 | 0.035281 | 0.016201 | 0.016201 |
| DebitCard_Active | 0.099862 | 0.285630 | 0.223872 | 0.002179 | 1.000000 | 0.148153 | 0.022313 | 0.141464 | 0.118533 | 0.089266 | ... | 0.178398 | 0.164068 | 0.051871 | 0.011021 | 0.018590 | 0.058112 | 0.243219 | 0.168320 | 0.124437 | 0.124437 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Insurance_Mobile | 0.079069 | 0.191357 | 0.067158 | 0.021570 | 0.058112 | 0.019291 | 0.011712 | 0.132112 | 0.040485 | 0.019767 | ... | 0.011414 | 0.007520 | 0.175155 | 0.250395 | 0.230260 | 1.000000 | 0.021671 | 0.060514 | 0.451573 | 0.451573 |
| Insurance_ATM | 0.047733 | 0.019796 | 0.217906 | 0.045069 | 0.243219 | 0.037870 | 0.012652 | 0.054322 | 0.058469 | 0.060119 | ... | 0.128352 | 0.097599 | 0.233253 | 0.149513 | 0.146543 | 0.021671 | 1.000000 | 0.176342 | 0.512532 | 0.512532 |
| Insurance_Unemployment | 0.007771 | 0.013618 | 0.708234 | 0.035281 | 0.168320 | 0.038151 | 0.000777 | 0.023123 | 0.051635 | 0.012599 | ... | 0.007100 | 0.051294 | 0.271025 | 0.107511 | 0.298883 | 0.060514 | 0.176342 | 1.000000 | 0.510293 | 0.510293 |
| TotalInsurances | 0.031480 | 0.194221 | 0.465033 | 0.016201 | 0.124437 | 0.032341 | 0.032548 | 0.020222 | 0.085595 | 0.060886 | ... | 0.088078 | 0.091103 | 0.710251 | 0.589711 | 0.687889 | 0.451573 | 0.512532 | 0.510293 | 1.000000 | 1.000000 |
| Porcent_Total_Insurances | 0.031480 | 0.194221 | 0.465033 | 0.016201 | 0.124437 | 0.032341 | 0.032548 | 0.020222 | 0.085595 | 0.060886 | ... | 0.088078 | 0.091103 | 0.710251 | 0.589711 | 0.687889 | 0.451573 | 0.512532 | 0.510293 | 1.000000 | 1.000000 |
279 rows × 279 columns
sns.heatmap(corr_Matrix, annot=True)
<Axes: >
upper = corr_Matrix.where(np.triu(np.ones(corr_Matrix.shape),k=1).astype(bool)).fillna(0)
## features to drop, because correlation is up to 0.8
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
#len(to_drop)
to_drop
['CreditCard_Product', 'SavingAccount_Active_ARG', 'SavingAccount_Days_with_Debits', 'SavingAccount_Transactions_Transactions', 'SavingAccount_Credits_Transactions', 'SavingAccount_Debits_Transactions', 'SavingAccount_Salary_Payment_Amount', 'SavingAccount_Transfer_In_Amount', 'SavingAccount_ATM_Extraction_Amount', 'SavingAccount_DebitCard_Spend_Amount', 'SavingAccount_Credits_Amounts', 'SavingAccount_Debits_Amounts', 'CreditCard_Spending_1_Installment', 'CreditCard_Spending_CrossBoarder', 'SavingAccount_Days_with_Credits_sum', 'SavingAccount_Days_with_Credits_max', 'SavingAccount_Days_with_Credits_min', 'SavingAccount_Days_with_Credits_mean', 'SavingAccount_Days_with_Credits_nunique', 'SavingAccount_Days_with_Debits_sum', 'SavingAccount_Days_with_Debits_max', 'SavingAccount_Days_with_Debits_min', 'SavingAccount_Days_with_Debits_mean', 'SavingAccount_Days_with_Debits_count_nonzero', 'SavingAccount_Salary_Payment_Transactions_sum', 'SavingAccount_Salary_Payment_Transactions_max', 'SavingAccount_Salary_Payment_Transactions_min', 'SavingAccount_Salary_Payment_Transactions_mean', 'SavingAccount_Salary_Payment_Transactions_nunique', 'SavingAccount_Salary_Payment_Transactions_count_nonzero', 'SavingAccount_Salary_Payment_Transactions_var', 'SavingAccount_Transfer_In_Transactions_sum', 'SavingAccount_Transfer_In_Transactions_max', 'SavingAccount_Transfer_In_Transactions_mean', 'SavingAccount_Transfer_In_Transactions_nunique', 'SavingAccount_Transfer_In_Transactions_count_nonzero', 'SavingAccount_Transfer_In_Transactions_var', 'SavingAccount_ATM_Extraction_Transactions_sum', 'SavingAccount_ATM_Extraction_Transactions_max', 'SavingAccount_ATM_Extraction_Transactions_min', 'SavingAccount_ATM_Extraction_Transactions_mean', 'SavingAccount_ATM_Extraction_Transactions_count_nonzero', 'SavingAccount_ATM_Extraction_Transactions_var', 'SavingAccount_CreditCard_Payment_Transactions_sum', 'SavingAccount_CreditCard_Payment_Transactions_max', 'SavingAccount_CreditCard_Payment_Transactions_min', 'SavingAccount_CreditCard_Payment_Transactions_mean', 'SavingAccount_CreditCard_Payment_Transactions_nunique', 'SavingAccount_CreditCard_Payment_Transactions_count_nonzero', 'SavingAccount_CreditCard_Payment_Transactions_var', 'SavingAccount_Transfer_Out_Transactions_sum', 'SavingAccount_Transfer_Out_Transactions_max', 'SavingAccount_Transfer_Out_Transactions_min', 'SavingAccount_Transfer_Out_Transactions_mean', 'SavingAccount_Transfer_Out_Transactions_nunique', 'SavingAccount_Transfer_Out_Transactions_count_nonzero', 'SavingAccount_Transfer_Out_Transactions_var', 'SavingAccount_DebitCard_Spend_Transactions_sum', 'SavingAccount_DebitCard_Spend_Transactions_max', 'SavingAccount_DebitCard_Spend_Transactions_min', 'SavingAccount_DebitCard_Spend_Transactions_mean', 'SavingAccount_DebitCard_Spend_Transactions_nunique', 'SavingAccount_DebitCard_Spend_Transactions_count_nonzero', 'SavingAccount_DebitCard_Spend_Transactions_var', 'SavingAccount_Transactions_Transactions_sum', 'SavingAccount_Transactions_Transactions_max', 'SavingAccount_Transactions_Transactions_min', 'SavingAccount_Transactions_Transactions_mean', 'SavingAccount_Transactions_Transactions_nunique', 'SavingAccount_Transactions_Transactions_count_nonzero', 'SavingAccount_Credits_Transactions_sum', 'SavingAccount_Credits_Transactions_max', 'SavingAccount_Credits_Transactions_min', 'SavingAccount_Credits_Transactions_mean', 'SavingAccount_Credits_Transactions_nunique', 'SavingAccount_Credits_Transactions_count_nonzero', 'SavingAccount_Debits_Transactions_sum', 'SavingAccount_Debits_Transactions_max', 'SavingAccount_Debits_Transactions_min', 'SavingAccount_Debits_Transactions_mean', 'SavingAccount_Debits_Transactions_nunique', 'SavingAccount_Debits_Transactions_count_nonzero', 'SavingAccount_Debits_Transactions_var', 'SavingAccount_Salary_Payment_Amount_sum', 'SavingAccount_Salary_Payment_Amount_max', 'SavingAccount_Salary_Payment_Amount_min', 'SavingAccount_Salary_Payment_Amount_mean', 'SavingAccount_Salary_Payment_Amount_count_nonzero', 'SavingAccount_Transfer_In_Amount_sum', 'SavingAccount_Transfer_In_Amount_max', 'SavingAccount_Transfer_In_Amount_min', 'SavingAccount_Transfer_In_Amount_mean', 'SavingAccount_Transfer_In_Amount_nunique', 'SavingAccount_Transfer_In_Amount_count_nonzero', 'SavingAccount_ATM_Extraction_Amount_sum', 'SavingAccount_ATM_Extraction_Amount_max', 'SavingAccount_ATM_Extraction_Amount_min', 'SavingAccount_ATM_Extraction_Amount_mean', 'SavingAccount_ATM_Extraction_Amount_nunique', 'SavingAccount_ATM_Extraction_Amount_count_nonzero', 'SavingAccount_ATM_Extraction_Amount_var', 'SavingAccount_CreditCard_Payment_Amount_sum', 'SavingAccount_CreditCard_Payment_Amount_max', 'SavingAccount_CreditCard_Payment_Amount_mean', 'SavingAccount_CreditCard_Payment_Amount_nunique', 'SavingAccount_CreditCard_Payment_Amount_count_nonzero', 'SavingAccount_CreditCard_Payment_Amount_var', 'SavingAccount_Transfer_Out_Amount_max', 'SavingAccount_Transfer_Out_Amount_mean', 'SavingAccount_Transfer_Out_Amount_count_nonzero', 'SavingAccount_Transfer_Out_Amount_var', 'SavingAccount_DebitCard_Spend_Amount_sum', 'SavingAccount_DebitCard_Spend_Amount_max', 'SavingAccount_DebitCard_Spend_Amount_min', 'SavingAccount_DebitCard_Spend_Amount_mean', 'SavingAccount_DebitCard_Spend_Amount_nunique', 'SavingAccount_DebitCard_Spend_Amount_count_nonzero', 'SavingAccount_Total_Amount_sum', 'SavingAccount_Total_Amount_max', 'SavingAccount_Total_Amount_min', 'SavingAccount_Total_Amount_mean', 'SavingAccount_Total_Amount_nunique', 'SavingAccount_Total_Amount_count_nonzero', 'SavingAccount_Credits_Amounts_sum', 'SavingAccount_Credits_Amounts_max', 'SavingAccount_Credits_Amounts_min', 'SavingAccount_Credits_Amounts_mean', 'SavingAccount_Credits_Amounts_nunique', 'SavingAccount_Credits_Amounts_count_nonzero', 'SavingAccount_Credits_Amounts_var', 'SavingAccount_Debits_Amounts_sum', 'SavingAccount_Debits_Amounts_max', 'SavingAccount_Debits_Amounts_min', 'SavingAccount_Debits_Amounts_mean', 'SavingAccount_Debits_Amounts_nunique', 'SavingAccount_Debits_Amounts_count_nonzero', 'SavingAccount_Debits_Amounts_var', 'Operations_HomeBanking_sum', 'Operations_HomeBanking_max', 'Operations_HomeBanking_min', 'Operations_HomeBanking_mean', 'Operations_HomeBanking_nunique', 'Operations_HomeBanking_count_nonzero', 'Operations_Mobile_sum', 'Operations_Mobile_max', 'Operations_Mobile_min', 'Operations_Mobile_mean', 'Operations_Mobile_nunique', 'Operations_Mobile_count_nonzero', 'CreditCard_Balance_ARG_sum', 'CreditCard_Balance_ARG_max', 'CreditCard_Balance_ARG_min', 'CreditCard_Balance_ARG_mean', 'CreditCard_Balance_ARG_nunique', 'CreditCard_Balance_ARG_count_nonzero', 'CreditCard_Balance_DOLLAR_max', 'CreditCard_Balance_DOLLAR_mean', 'CreditCard_Balance_DOLLAR_count_nonzero', 'CreditCard_Balance_DOLLAR_var', 'CreditCard_Total_Limit_sum', 'CreditCard_Total_Limit_max', 'CreditCard_Total_Limit_min', 'CreditCard_Total_Limit_mean', 'CreditCard_Total_Limit_count_nonzero', 'CreditCard_Total_Spending_sum', 'CreditCard_Total_Spending_max', 'CreditCard_Total_Spending_min', 'CreditCard_Total_Spending_mean', 'CreditCard_Total_Spending_nunique', 'CreditCard_Total_Spending_count_nonzero', 'CreditCard_Spending_1_Installment_sum', 'CreditCard_Spending_1_Installment_max', 'CreditCard_Spending_1_Installment_mean', 'CreditCard_Spending_1_Installment_count_nonzero', 'CreditCard_Spending_1_Installment_var', 'CreditCard_Spending_CrossBoarder_sum', 'CreditCard_Spending_CrossBoarder_max', 'CreditCard_Spending_CrossBoarder_mean', 'CreditCard_Spending_CrossBoarder_nunique', 'CreditCard_Spending_CrossBoarder_count_nonzero', 'CreditCard_Spending_CrossBoarder_var', 'CreditCard_Spending_Aut_Debits_sum', 'CreditCard_Spending_Aut_Debits_max', 'CreditCard_Spending_Aut_Debits_min', 'CreditCard_Spending_Aut_Debits_mean', 'CreditCard_Revolving_sum', 'CreditCard_Revolving_max', 'CreditCard_Revolving_mean', 'CreditCard_Revolving_count_nonzero', 'Operations_HomeBanking_y', 'Operations_Mobile_y', 'Porcent_Operations_HomeBanking', 'Porcent_Operations_Mobile', 'Porcent_Total_Insurances']
#drop features
ABT.drop(to_drop,axis=1, inplace=True)
ABT.shape
(23191, 87)
ABT.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 23191 entries, 0 to 23190 Data columns (total 87 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 client_id 23191 non-null int32 1 CreditCard_Premium 23191 non-null int32 2 CreditCard_Active 23191 non-null int32 3 Loan_Active 23191 non-null int32 4 Mortgage_Active 23191 non-null int32 5 DebitCard_Active 23191 non-null int32 6 Investment_Active 23191 non-null int32 7 Sex 23191 non-null int32 8 Client_Age_grp 23191 non-null int64 9 Mobile 23191 non-null int32 10 Email 23191 non-null int32 11 Region_AMBA Resto 23191 non-null int32 12 Region_BUENOS AIRES 23191 non-null int32 13 Region_CABA Centro/Norte 23191 non-null int32 14 Region_REGION CENTRO 23191 non-null int32 15 Region_REGION CUYO 23191 non-null int32 16 Region_REGION NORTE GRANDE ARGENTINO 23191 non-null int32 17 Region_REGION PATAGONICA 23191 non-null int32 18 SavingAccount_Active_ARG_Salary 23191 non-null int32 19 SavingAccount_Active_DOLLAR 23191 non-null int32 20 SavingAccount_Days_with_Credits 23191 non-null float64 21 SavingAccount_Salary_Payment_Transactions 23191 non-null float64 22 SavingAccount_Transfer_In_Transactions 23191 non-null float64 23 SavingAccount_ATM_Extraction_Transactions 23191 non-null float64 24 SavingAccount_CreditCard_Payment_Transactions 23191 non-null float64 25 SavingAccount_Transfer_Out_Transactions 23191 non-null float64 26 SavingAccount_DebitCard_Spend_Transactions 23191 non-null float64 27 SavingAccount_CreditCard_Payment_Amount 23191 non-null float64 28 SavingAccount_Transfer_Out_Amount 23191 non-null float64 29 SavingAccount_Total_Amount 23191 non-null float64 30 Operations_HomeBanking_x 23191 non-null float64 31 Operations_Mobile_x 23191 non-null float64 32 CreditCard_Balance_ARG 23191 non-null float64 33 CreditCard_Balance_DOLLAR 23191 non-null float64 34 CreditCard_Total_Limit 23191 non-null float64 35 CreditCard_Total_Spending 23191 non-null float64 36 CreditCard_Spending_Aut_Debits 23191 non-null float64 37 CreditCard_Revolving 23191 non-null float64 38 TGT 23191 non-null float64 39 SavingAccount_Days_with_Credits_count_nonzero 23191 non-null int64 40 SavingAccount_Days_with_Credits_var 23191 non-null float64 41 SavingAccount_Days_with_Debits_nunique 23191 non-null int64 42 SavingAccount_Days_with_Debits_var 23191 non-null float64 43 SavingAccount_Transfer_In_Transactions_min 23191 non-null float64 44 SavingAccount_ATM_Extraction_Transactions_nunique 23191 non-null int64 45 SavingAccount_Transactions_Transactions_var 23191 non-null float64 46 SavingAccount_Credits_Transactions_var 23191 non-null float64 47 SavingAccount_Salary_Payment_Amount_nunique 23191 non-null int64 48 SavingAccount_Salary_Payment_Amount_var 23191 non-null float64 49 SavingAccount_Transfer_In_Amount_var 23191 non-null float64 50 SavingAccount_CreditCard_Payment_Amount_min 23191 non-null float64 51 SavingAccount_Transfer_Out_Amount_sum 23191 non-null float64 52 SavingAccount_Transfer_Out_Amount_min 23191 non-null float64 53 SavingAccount_Transfer_Out_Amount_nunique 23191 non-null int64 54 SavingAccount_DebitCard_Spend_Amount_var 23191 non-null float64 55 SavingAccount_Total_Amount_var 23191 non-null float64 56 Operations_HomeBanking_var 23191 non-null float64 57 Operations_Mobile_var 23191 non-null float64 58 CreditCard_Balance_ARG_var 23191 non-null float64 59 CreditCard_Balance_DOLLAR_sum 23191 non-null float64 60 CreditCard_Balance_DOLLAR_min 23191 non-null float64 61 CreditCard_Balance_DOLLAR_nunique 23191 non-null int64 62 CreditCard_Total_Limit_nunique 23191 non-null int64 63 CreditCard_Total_Limit_var 23191 non-null float64 64 CreditCard_Total_Spending_var 23191 non-null float64 65 CreditCard_Spending_1_Installment_min 23191 non-null float64 66 CreditCard_Spending_1_Installment_nunique 23191 non-null int64 67 CreditCard_Spending_CrossBoarder_min 23191 non-null float64 68 CreditCard_Spending_Aut_Debits_nunique 23191 non-null int64 69 CreditCard_Spending_Aut_Debits_count_nonzero 23191 non-null int64 70 CreditCard_Spending_Aut_Debits_var 23191 non-null float64 71 CreditCard_Revolving_min 23191 non-null float64 72 CreditCard_Revolving_nunique 23191 non-null int64 73 CreditCard_Revolving_var 23191 non-null float64 74 Operations_Bank 23191 non-null float64 75 Operations_Terminal 23191 non-null float64 76 Operations_Ivr 23191 non-null float64 77 Operations_Telemarketer 23191 non-null float64 78 Operations_ATM 23191 non-null float64 79 TotalOperations 23191 non-null float64 80 Insurance_Life 23191 non-null int32 81 Insurance_Home 23191 non-null int32 82 Insurance_Accidents 23191 non-null int32 83 Insurance_Mobile 23191 non-null int32 84 Insurance_ATM 23191 non-null int32 85 Insurance_Unemployment 23191 non-null int32 86 TotalInsurances 23191 non-null int32 dtypes: float64(49), int32(26), int64(12) memory usage: 13.1 MB
ABT
| client_id | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | Mobile | ... | Operations_Telemarketer | Operations_ATM | TotalOperations | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5928737 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 30 | 1 | ... | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 6018047 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 60 | 1 | ... | 0.0 | 0.0 | 11.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 5359038 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 40 | 1 | ... | 0.0 | 7.0 | 83.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 6890812 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 40 | 1 | ... | 1.0 | 0.0 | 73.0 | 1 | 0 | 0 | 0 | 0 | 1 | 2 |
| 4 | 115383 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 70 | 0 | ... | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23186 | 6570413 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 30 | 1 | ... | 0.0 | 0.0 | 10.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 23187 | 6258895 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 30 | 1 | ... | 0.0 | 0.0 | 0.0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 23188 | 6397274 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 40 | 0 | ... | 0.0 | 0.0 | 0.0 | 1 | 0 | 1 | 0 | 0 | 0 | 2 |
| 23189 | 6007291 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 40 | 0 | ... | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 23190 | 6412619 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 60 | 1 | ... | 0.0 | 0.0 | 6.0 | 1 | 1 | 0 | 0 | 1 | 0 | 3 |
23191 rows × 87 columns
Feature selection¶
names = [x for x in ABT.columns if (x != 'client_id') & (x != 'TGT') ]
scaler = StandardScaler(copy=True)
scaler.fit(ABT[names]) # Entrena
scaled_est = scaler.transform(ABT[names]) # Standariza el total de la base
scaled_est = pd.DataFrame(scaled_est, columns=names, index=ABT.index)
ABT.drop(names, axis=1, inplace = True)
ABT = pd.concat((ABT, scaled_est), axis=1, sort=False)
ABT.head(5)
| client_id | TGT | CreditCard_Premium | CreditCard_Active | Loan_Active | Mortgage_Active | DebitCard_Active | Investment_Active | Sex | Client_Age_grp | ... | Operations_Telemarketer | Operations_ATM | TotalOperations | Insurance_Life | Insurance_Home | Insurance_Accidents | Insurance_Mobile | Insurance_ATM | Insurance_Unemployment | TotalInsurances | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5928737 | 0.0 | -0.361090 | -1.35619 | -0.383097 | -0.1513 | 0.702561 | -0.240698 | 0.862295 | -1.070796 | ... | -0.431961 | -0.251284 | -0.639048 | -0.489723 | -0.412265 | -0.43036 | -0.287629 | -0.410115 | -0.290693 | -0.675250 |
| 1 | 6018047 | 1.0 | 2.769392 | 0.73736 | -0.383097 | -0.1513 | 0.702561 | -0.240698 | 0.862295 | 1.159183 | ... | -0.431961 | -0.251284 | -0.196695 | -0.489723 | -0.412265 | -0.43036 | -0.287629 | -0.410115 | -0.290693 | -0.675250 |
| 2 | 5359038 | 1.0 | 2.769392 | 0.73736 | 2.610303 | -0.1513 | 0.702561 | -0.240698 | 0.862295 | -0.327469 | ... | -0.431961 | 1.489019 | 2.698708 | -0.489723 | -0.412265 | -0.43036 | -0.287629 | -0.410115 | -0.290693 | -0.675250 |
| 3 | 6890812 | 0.0 | -0.361090 | -1.35619 | 2.610303 | -0.1513 | 0.702561 | -0.240698 | 0.862295 | -0.327469 | ... | 0.312898 | -0.251284 | 2.296569 | 2.041969 | -0.412265 | -0.43036 | -0.287629 | -0.410115 | 3.440055 | 1.027172 |
| 4 | 115383 | 0.0 | -0.361090 | -1.35619 | -0.383097 | -0.1513 | -1.423364 | -0.240698 | 0.862295 | 1.902509 | ... | -0.431961 | -0.251284 | -0.639048 | -0.489723 | -0.412265 | -0.43036 | -0.287629 | -0.410115 | -0.290693 | -0.675250 |
5 rows × 87 columns
searchCV¶
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
target_column = 'TGT'
numerical_cols = [x for x in ABT.columns if (x != 'client_id') & (x != 'TGT')]
estimator = XGBClassifier(
objective= 'binary:logistic',
seed=42
)
parameters = {
'max_depth': np.arange(6, 10, 1),
'learning_rate': np.arange(0.01, 1, 0.05),
'gamma': np.arange(0.1, 2, 0.1),
'alpha': np.arange(0,10,1),
'lambda': np.arange(0,10,1),
'subsample': np.arange(0.1, 1, 0.1),
'n_estimators': np.arange(15, 20, 1)
}
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100
grid_search = RandomizedSearchCV(
estimator=estimator,
param_distributions=parameters,
n_iter= n_HP_points_to_test,
scoring='roc_auc',
cv=3,
refit=True,
verbose=False)
grid_search.fit(ABT[numerical_cols], ABT[target_column])
RandomizedSearchCV(cv=3,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate...
'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
'max_depth': array([6, 7, 8, 9]),
'n_estimators': array([15, 16, 17, 18, 19]),
'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
scoring='roc_auc', verbose=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=3,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate...
'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
'max_depth': array([6, 7, 8, 9]),
'n_estimators': array([15, 16, 17, 18, 19]),
'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
scoring='roc_auc', verbose=False)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)from xgboost import plot_importance
plot_importance(grid_search.best_estimator_,max_num_features=25)
plt.show()
importance feature¶
feature_importance = grid_search.best_estimator_.get_booster().get_score()
keys = list(feature_importance.keys())
values = list(feature_importance.values())
data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)
a=data.head(50)
a.to_csv(r'result/top_50_Features.csv', sep='|')
a = pd.read_csv(r'result/top_50_Features.csv', sep='|')
a.columns = ['column', 'order']
a
| column | order | |
|---|---|---|
| 0 | CreditCard_Balance_ARG_var | 62.0 |
| 1 | TotalOperations | 61.0 |
| 2 | CreditCard_Total_Limit | 58.0 |
| 3 | CreditCard_Total_Spending | 50.0 |
| 4 | SavingAccount_Total_Amount | 48.0 |
| 5 | Operations_Bank | 44.0 |
| 6 | Client_Age_grp | 44.0 |
| 7 | Operations_Terminal | 42.0 |
| 8 | CreditCard_Total_Spending_var | 40.0 |
| 9 | Operations_HomeBanking_var | 40.0 |
| 10 | CreditCard_Revolving_min | 38.0 |
| 11 | CreditCard_Total_Limit_var | 38.0 |
| 12 | CreditCard_Revolving | 37.0 |
| 13 | CreditCard_Spending_Aut_Debits_var | 36.0 |
| 14 | SavingAccount_Transactions_Transactions_var | 35.0 |
| 15 | SavingAccount_Total_Amount_var | 34.0 |
| 16 | CreditCard_Balance_ARG | 33.0 |
| 17 | CreditCard_Revolving_var | 31.0 |
| 18 | CreditCard_Spending_Aut_Debits | 30.0 |
| 19 | SavingAccount_Days_with_Credits_var | 29.0 |
| 20 | SavingAccount_Credits_Transactions_var | 27.0 |
| 21 | CreditCard_Balance_DOLLAR_sum | 24.0 |
| 22 | TotalInsurances | 23.0 |
| 23 | Operations_Telemarketer | 21.0 |
| 24 | Operations_Mobile_var | 21.0 |
| 25 | CreditCard_Active | 20.0 |
| 26 | SavingAccount_Transfer_In_Amount_var | 19.0 |
| 27 | SavingAccount_DebitCard_Spend_Transactions | 19.0 |
| 28 | SavingAccount_DebitCard_Spend_Amount_var | 17.0 |
| 29 | SavingAccount_Salary_Payment_Amount_var | 17.0 |
| 30 | CreditCard_Spending_1_Installment_min | 17.0 |
| 31 | SavingAccount_Days_with_Debits_var | 17.0 |
| 32 | Operations_ATM | 16.0 |
| 33 | SavingAccount_Active_ARG_Salary | 15.0 |
| 34 | SavingAccount_Transfer_Out_Amount_sum | 15.0 |
| 35 | SavingAccount_Salary_Payment_Transactions | 15.0 |
| 36 | CreditCard_Spending_Aut_Debits_nunique | 15.0 |
| 37 | Operations_HomeBanking_x | 15.0 |
| 38 | DebitCard_Active | 14.0 |
| 39 | Loan_Active | 14.0 |
| 40 | SavingAccount_Days_with_Debits_nunique | 13.0 |
| 41 | SavingAccount_CreditCard_Payment_Amount | 12.0 |
| 42 | Operations_Ivr | 11.0 |
| 43 | CreditCard_Spending_1_Installment_nunique | 10.0 |
| 44 | Region_REGION CUYO | 10.0 |
| 45 | CreditCard_Spending_Aut_Debits_count_nonzero | 10.0 |
| 46 | Mobile | 10.0 |
| 47 | SavingAccount_Days_with_Credits_count_nonzero | 10.0 |
| 48 | Region_BUENOS AIRES | 9.0 |
| 49 | Region_REGION CENTRO | 9.0 |
Bivariate Analysis¶
def plot_numerical_data(data_col, target_col = 'TGT', _df = ABT, bins=None, qs = None):
print (data_col)
df = _df.copy()
if (bins == None):
df['rank'] = round(df[data_col].rank(pct=True) * 9)
else:
df['rank'] = pd.cut(df[data_col], bins)
if (qs != None):
df['rank'] = pd.qcut(df[data_col], qs, duplicates='drop')
rank = pd.DataFrame(
df.groupby(['rank'])[data_col].agg([np.min, np.max, np.average])
).reset_index()
groupped = pd.DataFrame(
df.groupby(['rank'])[['client_id']].agg('nunique', np.sum).astype('int64')
).reset_index()
groupped.columns = ['rank', '# of clients']
target = pd.DataFrame(
df.groupby(['rank'])[[target_col]].agg(np.sum).astype('int64')
).reset_index()
target.columns = ['rank', '# of target clients']
merged = rank.merge(groupped, how='left').merge(target, how='left')
merged['target_p'] = (
(merged['# of target clients'] / merged['# of clients']) * 100
)
display(merged)
# Plot
merged['# of clients'].plot(kind='bar')
merged['target_p'].plot(secondary_y=True, color='g')
plt.show()
return merged
a
| column | order | |
|---|---|---|
| 0 | CreditCard_Balance_ARG_var | 62.0 |
| 1 | TotalOperations | 61.0 |
| 2 | CreditCard_Total_Limit | 58.0 |
| 3 | CreditCard_Total_Spending | 50.0 |
| 4 | SavingAccount_Total_Amount | 48.0 |
| 5 | Operations_Bank | 44.0 |
| 6 | Client_Age_grp | 44.0 |
| 7 | Operations_Terminal | 42.0 |
| 8 | CreditCard_Total_Spending_var | 40.0 |
| 9 | Operations_HomeBanking_var | 40.0 |
| 10 | CreditCard_Revolving_min | 38.0 |
| 11 | CreditCard_Total_Limit_var | 38.0 |
| 12 | CreditCard_Revolving | 37.0 |
| 13 | CreditCard_Spending_Aut_Debits_var | 36.0 |
| 14 | SavingAccount_Transactions_Transactions_var | 35.0 |
| 15 | SavingAccount_Total_Amount_var | 34.0 |
| 16 | CreditCard_Balance_ARG | 33.0 |
| 17 | CreditCard_Revolving_var | 31.0 |
| 18 | CreditCard_Spending_Aut_Debits | 30.0 |
| 19 | SavingAccount_Days_with_Credits_var | 29.0 |
| 20 | SavingAccount_Credits_Transactions_var | 27.0 |
| 21 | CreditCard_Balance_DOLLAR_sum | 24.0 |
| 22 | TotalInsurances | 23.0 |
| 23 | Operations_Telemarketer | 21.0 |
| 24 | Operations_Mobile_var | 21.0 |
| 25 | CreditCard_Active | 20.0 |
| 26 | SavingAccount_Transfer_In_Amount_var | 19.0 |
| 27 | SavingAccount_DebitCard_Spend_Transactions | 19.0 |
| 28 | SavingAccount_DebitCard_Spend_Amount_var | 17.0 |
| 29 | SavingAccount_Salary_Payment_Amount_var | 17.0 |
| 30 | CreditCard_Spending_1_Installment_min | 17.0 |
| 31 | SavingAccount_Days_with_Debits_var | 17.0 |
| 32 | Operations_ATM | 16.0 |
| 33 | SavingAccount_Active_ARG_Salary | 15.0 |
| 34 | SavingAccount_Transfer_Out_Amount_sum | 15.0 |
| 35 | SavingAccount_Salary_Payment_Transactions | 15.0 |
| 36 | CreditCard_Spending_Aut_Debits_nunique | 15.0 |
| 37 | Operations_HomeBanking_x | 15.0 |
| 38 | DebitCard_Active | 14.0 |
| 39 | Loan_Active | 14.0 |
| 40 | SavingAccount_Days_with_Debits_nunique | 13.0 |
| 41 | SavingAccount_CreditCard_Payment_Amount | 12.0 |
| 42 | Operations_Ivr | 11.0 |
| 43 | CreditCard_Spending_1_Installment_nunique | 10.0 |
| 44 | Region_REGION CUYO | 10.0 |
| 45 | CreditCard_Spending_Aut_Debits_count_nonzero | 10.0 |
| 46 | Mobile | 10.0 |
| 47 | SavingAccount_Days_with_Credits_count_nonzero | 10.0 |
| 48 | Region_BUENOS AIRES | 9.0 |
| 49 | Region_REGION CENTRO | 9.0 |
most_important=a['column']
for columna in most_important:
plot_numerical_data(columna)
CreditCard_Balance_ARG_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | -0.333423 | -0.333423 | -0.333423 | 7363 | 622 | 8.447644 |
| 1 | 3.0 | -0.333423 | -0.329075 | -0.332007 | 1655 | 450 | 27.190332 |
| 2 | 4.0 | -0.329074 | -0.311315 | -0.321289 | 2577 | 1069 | 41.482344 |
| 3 | 5.0 | -0.311312 | -0.272232 | -0.294380 | 2577 | 1090 | 42.297245 |
| 4 | 6.0 | -0.272174 | -0.179269 | -0.232096 | 2577 | 1074 | 41.676368 |
| 5 | 7.0 | -0.179199 | 0.072656 | -0.077905 | 2576 | 1085 | 42.119565 |
| 6 | 8.0 | 0.072684 | 1.204171 | 0.459300 | 2577 | 1049 | 40.706248 |
| 7 | 9.0 | 1.205259 | 24.458540 | 3.263167 | 1289 | 384 | 29.790535 |
TotalOperations
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | -0.639048 | -0.639048 | -0.639048 | 5666 | 575 | 10.148253 |
| 1 | 2.0 | -0.598834 | -0.598834 | -0.598834 | 1545 | 257 | 16.634304 |
| 2 | 3.0 | -0.558620 | -0.518406 | -0.540265 | 1985 | 381 | 19.193955 |
| 3 | 4.0 | -0.478192 | -0.397764 | -0.433633 | 2767 | 694 | 25.081316 |
| 4 | 5.0 | -0.357550 | -0.277122 | -0.321452 | 2179 | 689 | 31.620009 |
| 5 | 6.0 | -0.236909 | -0.035839 | -0.149538 | 2479 | 941 | 37.958854 |
| 6 | 7.0 | 0.004375 | 0.567370 | 0.241806 | 2766 | 1182 | 42.733189 |
| 7 | 8.0 | 0.607584 | 2.135713 | 1.225947 | 2513 | 1313 | 52.248309 |
| 8 | 9.0 | 2.175927 | 10.138283 | 3.206693 | 1291 | 791 | 61.270333 |
CreditCard_Total_Limit
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | -0.878796 | -0.878796 | -0.878796 | 7427 | 554 | 7.459270 |
| 1 | 3.0 | -0.869794 | -0.446705 | -0.554437 | 1748 | 546 | 31.235698 |
| 2 | 4.0 | -0.374690 | -0.302674 | -0.328121 | 1599 | 589 | 36.835522 |
| 3 | 5.0 | -0.158644 | -0.014614 | -0.081112 | 3851 | 1528 | 39.678006 |
| 4 | 6.0 | 0.021394 | 0.201432 | 0.129444 | 1311 | 589 | 44.927536 |
| 5 | 7.0 | 0.273447 | 0.561508 | 0.402769 | 3442 | 1475 | 42.852992 |
| 6 | 8.0 | 0.849568 | 1.641735 | 1.109193 | 2373 | 990 | 41.719343 |
| 7 | 9.0 | 2.001811 | 4.162266 | 2.878371 | 1440 | 552 | 38.333333 |
CreditCard_Total_Spending
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -2.069368 | -0.650194 | -0.842358 | 18 | 3 | 16.666667 |
| 1 | 2.0 | -0.649929 | -0.649929 | -0.649929 | 9377 | 837 | 8.926096 |
| 2 | 4.0 | -0.649781 | -0.439504 | -0.546846 | 2201 | 741 | 33.666515 |
| 3 | 5.0 | -0.439478 | -0.174241 | -0.308974 | 2576 | 1067 | 41.420807 |
| 4 | 6.0 | -0.174133 | 0.155760 | -0.017973 | 2577 | 1156 | 44.858362 |
| 5 | 7.0 | 0.156280 | 0.680816 | 0.387786 | 2576 | 1197 | 46.467391 |
| 6 | 8.0 | 0.680919 | 2.010680 | 1.194128 | 2577 | 1189 | 46.138921 |
| 7 | 9.0 | 2.011431 | 4.315257 | 3.164614 | 1289 | 633 | 49.107836 |
SavingAccount_Total_Amount
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.535720 | -0.535720 | -0.535720 | 11770 | 2409 | 20.467290 |
| 1 | 5.0 | -0.535719 | -0.353948 | -0.458817 | 2402 | 532 | 22.148210 |
| 2 | 6.0 | -0.353872 | -0.113505 | -0.246878 | 2577 | 712 | 27.629026 |
| 3 | 7.0 | -0.113304 | 0.487357 | 0.127233 | 2576 | 1060 | 41.149068 |
| 4 | 8.0 | 0.487399 | 3.069775 | 1.447382 | 2577 | 1385 | 53.744664 |
| 5 | 9.0 | 3.071361 | 3.092456 | 3.092357 | 1289 | 725 | 56.245151 |
Operations_Bank
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.339982 | -0.339982 | -0.339982 | 17488 | 4236 | 24.222324 |
| 1 | 7.0 | 0.105446 | 0.105446 | 0.105446 | 2475 | 1032 | 41.696970 |
| 2 | 8.0 | 0.550874 | 1.441731 | 0.841044 | 2014 | 904 | 44.885799 |
| 3 | 9.0 | 1.887159 | 42.421137 | 3.287287 | 1214 | 651 | 53.624382 |
Client_Age_grp
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.962787 | -1.962787 | -1.962787 | 989 | 249 | 25.176946 |
| 1 | 1.0 | -1.070796 | -1.070796 | -1.070796 | 5308 | 1214 | 22.871138 |
| 2 | 4.0 | -0.327469 | -0.327469 | -0.327469 | 6563 | 1660 | 25.293311 |
| 3 | 6.0 | 0.415857 | 0.415857 | 0.415857 | 5149 | 1796 | 34.880559 |
| 4 | 7.0 | 1.159183 | 1.159183 | 1.159183 | 2122 | 776 | 36.569274 |
| 5 | 8.0 | 1.530846 | 1.530846 | 1.530846 | 1745 | 694 | 39.770774 |
| 6 | 9.0 | 1.902509 | 1.902509 | 1.902509 | 1315 | 434 | 33.003802 |
Operations_Terminal
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.485396 | -0.485396 | -0.485396 | 14773 | 3178 | 21.512218 |
| 1 | 6.0 | -0.245225 | -0.245225 | -0.245225 | 1741 | 605 | 34.750144 |
| 2 | 7.0 | -0.005054 | 0.715460 | 0.344768 | 2992 | 1123 | 37.533422 |
| 3 | 8.0 | 0.955631 | 1.676145 | 1.169663 | 2490 | 1161 | 46.626506 |
| 4 | 9.0 | 1.916316 | 24.732580 | 3.057481 | 1195 | 756 | 63.263598 |
CreditCard_Total_Spending_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.364309 | -0.364309 | -0.364309 | 8555 | 738 | 8.626534 |
| 1 | 3.0 | -0.364309 | -0.363583 | -0.364076 | 463 | 150 | 32.397408 |
| 2 | 4.0 | -0.363579 | -0.340502 | -0.354216 | 2577 | 992 | 38.494373 |
| 3 | 5.0 | -0.340495 | -0.289687 | -0.318348 | 2577 | 1111 | 43.112146 |
| 4 | 6.0 | -0.289648 | -0.173521 | -0.239913 | 2577 | 1133 | 43.965852 |
| 5 | 7.0 | -0.173507 | 0.116739 | -0.055659 | 2576 | 1082 | 42.003106 |
| 6 | 8.0 | 0.116846 | 1.379620 | 0.563281 | 2577 | 1096 | 42.530074 |
| 7 | 9.0 | 1.380478 | 16.479469 | 3.358022 | 1289 | 521 | 40.418929 |
Operations_HomeBanking_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.379385 | -0.379385 | -0.379385 | 12506 | 2933 | 23.452743 |
| 1 | 5.0 | -0.326504 | -0.326504 | -0.326504 | 1995 | 564 | 28.270677 |
| 2 | 6.0 | -0.294776 | -0.167862 | -0.256302 | 2021 | 580 | 28.698664 |
| 3 | 7.0 | -0.167862 | 0.181151 | -0.046336 | 2766 | 968 | 34.996385 |
| 4 | 8.0 | 0.181151 | 1.524321 | 0.662021 | 2610 | 1173 | 44.942529 |
| 5 | 9.0 | 1.524321 | 16.806844 | 3.336611 | 1293 | 605 | 46.790410 |
CreditCard_Revolving_min
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -6.168433 | -0.712710 | -2.280568 | 1288 | 571 | 44.332298 |
| 1 | 1.0 | -0.712662 | -0.086175 | -0.278155 | 2577 | 1104 | 42.840512 |
| 2 | 2.0 | -0.086037 | -0.026004 | -0.042562 | 2577 | 998 | 38.727202 |
| 3 | 3.0 | -0.026000 | -0.020023 | -0.021423 | 2574 | 823 | 31.973582 |
| 4 | 4.0 | -0.020021 | -0.019922 | -0.019969 | 571 | 143 | 25.043783 |
| 5 | 6.0 | -0.019921 | -0.019921 | -0.019921 | 11110 | 2301 | 20.711071 |
| 6 | 8.0 | -0.019919 | 1.108721 | 0.387373 | 1205 | 427 | 35.435685 |
| 7 | 9.0 | 1.112892 | 6.128592 | 2.781178 | 1289 | 456 | 35.376261 |
CreditCard_Total_Limit_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.216976 | -0.216976 | -0.216976 | 16387 | 5216 | 31.830109 |
| 1 | 6.0 | -0.216952 | -0.195942 | -0.206782 | 335 | 78 | 23.283582 |
| 2 | 7.0 | -0.195438 | -0.009670 | -0.111716 | 2603 | 683 | 26.238955 |
| 3 | 8.0 | -0.008997 | 0.644553 | 0.214651 | 2574 | 623 | 24.203574 |
| 4 | 9.0 | 0.644553 | 29.465396 | 2.603049 | 1292 | 223 | 17.260062 |
CreditCard_Revolving
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -5.612467 | -0.336828 | -0.883084 | 1288 | 528 | 40.993789 |
| 1 | 1.0 | -0.336769 | -0.295570 | -0.302413 | 2575 | 974 | 37.825243 |
| 2 | 2.0 | -0.295569 | -0.295435 | -0.295494 | 900 | 302 | 33.555556 |
| 3 | 4.0 | -0.295434 | -0.295434 | -0.295434 | 13356 | 3180 | 23.809524 |
| 4 | 7.0 | -0.295432 | -0.122488 | -0.253519 | 1206 | 461 | 38.225539 |
| 5 | 8.0 | -0.122321 | 2.042820 | 0.750355 | 2577 | 947 | 36.748157 |
| 6 | 9.0 | 2.043695 | 5.021600 | 3.491048 | 1289 | 431 | 33.436773 |
CreditCard_Spending_Aut_Debits_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.184095 | -0.184095 | -0.184095 | 13308 | 2428 | 18.244665 |
| 1 | 5.0 | -0.184095 | -0.183724 | -0.183931 | 868 | 371 | 42.741935 |
| 2 | 6.0 | -0.183719 | -0.175336 | -0.180889 | 2574 | 1120 | 43.512044 |
| 3 | 7.0 | -0.175329 | -0.115869 | -0.155630 | 2575 | 1158 | 44.970874 |
| 4 | 8.0 | -0.115839 | 0.437216 | 0.060002 | 2577 | 1202 | 46.643384 |
| 5 | 9.0 | 0.439633 | 23.465088 | 2.576660 | 1289 | 544 | 42.203258 |
SavingAccount_Transactions_Transactions_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.324941 | -0.324941 | -0.324941 | 10935 | 2277 | 20.823045 |
| 1 | 4.0 | -0.319016 | -0.319016 | -0.319016 | 646 | 119 | 18.421053 |
| 2 | 5.0 | -0.315461 | -0.285836 | -0.302442 | 2609 | 702 | 26.906861 |
| 3 | 6.0 | -0.282281 | -0.211182 | -0.251483 | 2572 | 805 | 31.298600 |
| 4 | 7.0 | -0.211182 | 0.025817 | -0.118617 | 2564 | 999 | 38.962559 |
| 5 | 8.0 | 0.025817 | 1.344714 | 0.488846 | 2576 | 1274 | 49.456522 |
| 6 | 9.0 | 1.348269 | 18.682349 | 3.289422 | 1289 | 647 | 50.193949 |
SavingAccount_Total_Amount_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.335904 | -0.335904 | -0.335904 | 10684 | 2404 | 22.500936 |
| 1 | 4.0 | -0.335904 | -0.335896 | -0.335903 | 911 | 91 | 9.989023 |
| 2 | 5.0 | -0.335896 | -0.327482 | -0.333761 | 2577 | 578 | 22.429181 |
| 3 | 6.0 | -0.327476 | -0.274374 | -0.306968 | 2577 | 868 | 33.682577 |
| 4 | 7.0 | -0.274294 | -0.001548 | -0.171869 | 2576 | 1084 | 42.080745 |
| 5 | 8.0 | -0.001374 | 1.575777 | 0.513347 | 2577 | 1308 | 50.756694 |
| 6 | 9.0 | 1.585892 | 7.529617 | 3.619710 | 1289 | 490 | 38.013964 |
CreditCard_Balance_ARG
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -5.548031 | -0.653676 | -0.741103 | 766 | 144 | 18.798956 |
| 1 | 2.0 | -0.653675 | -0.653675 | -0.653675 | 7961 | 677 | 8.503957 |
| 2 | 3.0 | -0.653674 | -0.626485 | -0.645485 | 291 | 63 | 21.649485 |
| 3 | 4.0 | -0.626203 | -0.418643 | -0.519399 | 2577 | 955 | 37.058595 |
| 4 | 5.0 | -0.418577 | -0.188284 | -0.307576 | 2577 | 1081 | 41.948002 |
| 5 | 6.0 | -0.188089 | 0.130912 | -0.044256 | 2577 | 1148 | 44.547924 |
| 6 | 7.0 | 0.130944 | 0.686263 | 0.380433 | 2576 | 1120 | 43.478261 |
| 7 | 8.0 | 0.686605 | 2.065600 | 1.231719 | 2577 | 1133 | 43.965852 |
| 8 | 9.0 | 2.066048 | 4.240681 | 3.142321 | 1289 | 502 | 38.944919 |
CreditCard_Revolving_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.283414 | -0.283414 | -0.283414 | 9790 | 1830 | 18.692543 |
| 1 | 4.0 | -0.283414 | -0.283410 | -0.283413 | 1805 | 558 | 30.914127 |
| 2 | 5.0 | -0.283410 | -0.281129 | -0.282967 | 2577 | 1031 | 40.007761 |
| 3 | 6.0 | -0.281127 | -0.245192 | -0.268607 | 2577 | 1028 | 39.891347 |
| 4 | 7.0 | -0.245144 | -0.059331 | -0.177578 | 2576 | 995 | 38.625776 |
| 5 | 8.0 | -0.059146 | 1.110684 | 0.320401 | 2577 | 997 | 38.688397 |
| 6 | 9.0 | 1.120624 | 16.779817 | 3.366449 | 1289 | 384 | 29.790535 |
CreditCard_Spending_Aut_Debits
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -4.555431 | -0.508876 | -0.892626 | 18 | 4 | 22.222222 |
| 1 | 3.0 | -0.473121 | -0.473121 | -0.473121 | 13065 | 2183 | 16.708764 |
| 2 | 5.0 | -0.473078 | -0.370593 | -0.410499 | 1090 | 466 | 42.752294 |
| 3 | 6.0 | -0.370158 | -0.146712 | -0.283454 | 2576 | 1115 | 43.284161 |
| 4 | 7.0 | -0.146501 | 0.457383 | 0.121738 | 2576 | 1188 | 46.118012 |
| 5 | 8.0 | 0.457943 | 1.952927 | 1.027224 | 2577 | 1210 | 46.953822 |
| 6 | 9.0 | 1.955667 | 4.916441 | 3.424559 | 1289 | 657 | 50.969744 |
SavingAccount_Days_with_Credits_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.514025 | -0.514025 | -0.514025 | 11877 | 2531 | 21.310095 |
| 1 | 5.0 | -0.338605 | -0.233353 | -0.324791 | 2141 | 656 | 30.639888 |
| 2 | 6.0 | -0.233353 | 0.082402 | -0.115928 | 3101 | 1032 | 33.279587 |
| 3 | 7.0 | 0.082402 | 0.503409 | 0.260621 | 2331 | 850 | 36.465036 |
| 4 | 8.0 | 0.503409 | 1.766431 | 0.953721 | 2422 | 1120 | 46.242775 |
| 5 | 9.0 | 1.766431 | 13.554633 | 3.216469 | 1319 | 634 | 48.066717 |
SavingAccount_Credits_Transactions_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.466557 | -0.466557 | -0.466557 | 11555 | 2392 | 20.700995 |
| 1 | 4.0 | -0.386273 | -0.386273 | -0.386273 | 3 | 0 | 0.000000 |
| 2 | 5.0 | -0.386273 | -0.322046 | -0.359128 | 2663 | 760 | 28.539241 |
| 3 | 6.0 | -0.322046 | -0.129366 | -0.176838 | 2530 | 826 | 32.648221 |
| 4 | 7.0 | -0.129366 | 0.304165 | 0.102248 | 2573 | 976 | 37.932375 |
| 5 | 8.0 | 0.304165 | 1.733210 | 0.861197 | 2578 | 1224 | 47.478666 |
| 6 | 9.0 | 1.733210 | 14.177146 | 3.345793 | 1289 | 645 | 50.038790 |
CreditCard_Balance_DOLLAR_sum
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -13.787115 | -0.220613 | -1.725336 | 269 | 116 | 43.122677 |
| 1 | 4.0 | -0.220613 | -0.220613 | -0.220613 | 19933 | 5492 | 27.552300 |
| 2 | 8.0 | -0.220443 | 1.497041 | 0.372545 | 1700 | 700 | 41.176471 |
| 3 | 9.0 | 1.500672 | 13.345889 | 3.280270 | 1289 | 515 | 39.953452 |
TotalInsurances
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.675250 | -0.675250 | -0.675250 | 13686 | 3061 | 22.365921 |
| 1 | 6.0 | 0.175961 | 0.175961 | 0.175961 | 4333 | 1536 | 35.448881 |
| 2 | 8.0 | 1.027172 | 1.878383 | 1.347880 | 4244 | 1721 | 40.551367 |
| 3 | 9.0 | 2.729594 | 4.432016 | 2.972666 | 928 | 505 | 54.418103 |
Operations_Telemarketer
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.431961 | -0.431961 | -0.431961 | 16570 | 4252 | 25.660833 |
| 1 | 7.0 | 0.312898 | 0.312898 | 0.312898 | 3634 | 1291 | 35.525592 |
| 2 | 8.0 | 1.057756 | 1.057756 | 1.057756 | 1456 | 600 | 41.208791 |
| 3 | 9.0 | 1.802614 | 33.086659 | 2.926469 | 1531 | 680 | 44.415415 |
Operations_Mobile_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.241753 | -0.241753 | -0.241753 | 19430 | 5197 | 26.747298 |
| 1 | 8.0 | -0.193304 | 1.085745 | 0.180265 | 2472 | 964 | 38.996764 |
| 2 | 9.0 | 1.085745 | 14.951795 | 3.298401 | 1289 | 662 | 51.357642 |
CreditCard_Active
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -1.35619 | -1.35619 | -1.35619 | 8168 | 513 | 6.280607 |
| 1 | 6.0 | 0.73736 | 0.73736 | 0.73736 | 15023 | 6310 | 42.002263 |
SavingAccount_Transfer_In_Amount_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.375733 | -0.375733 | -0.375733 | 17025 | 4753 | 27.917768 |
| 1 | 7.0 | -0.375733 | -0.041883 | -0.275449 | 2300 | 616 | 26.782609 |
| 2 | 8.0 | -0.041748 | 2.611488 | 0.987892 | 2577 | 866 | 33.604967 |
| 3 | 9.0 | 2.612386 | 5.351821 | 3.479117 | 1289 | 588 | 45.616757 |
SavingAccount_DebitCard_Spend_Transactions
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.293537 | -0.293537 | -0.293537 | 20041 | 5096 | 25.427873 |
| 1 | 8.0 | -0.041914 | 1.719450 | 0.562755 | 1888 | 928 | 49.152542 |
| 2 | 9.0 | 1.971074 | 5.242178 | 3.819569 | 1262 | 799 | 63.312203 |
SavingAccount_DebitCard_Spend_Amount_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.313453 | -0.313453 | -0.313453 | 19147 | 4878 | 25.476576 |
| 1 | 7.0 | -0.313453 | -0.309404 | -0.311838 | 180 | 55 | 30.555556 |
| 2 | 8.0 | -0.309402 | 2.293676 | 0.490071 | 2575 | 1256 | 48.776699 |
| 3 | 9.0 | 2.295398 | 7.033519 | 3.720617 | 1289 | 634 | 49.185415 |
SavingAccount_Salary_Payment_Amount_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.154153 | -0.154153 | -0.154153 | 21791 | 6053 | 27.777523 |
| 1 | 8.0 | -0.154153 | -0.122156 | -0.143151 | 111 | 57 | 51.351351 |
| 2 | 9.0 | -0.122080 | 12.279220 | 2.618346 | 1289 | 713 | 55.314197 |
CreditCard_Spending_1_Installment_min
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -13.199359 | -0.207642 | -2.069425 | 146 | 55 | 37.671233 |
| 1 | 4.0 | -0.206982 | -0.206982 | -0.206982 | 18271 | 4499 | 24.623721 |
| 2 | 7.0 | -0.206321 | -0.095421 | -0.119688 | 1099 | 494 | 44.949955 |
| 3 | 8.0 | -0.094761 | 0.802558 | 0.155231 | 2386 | 1140 | 47.778709 |
| 4 | 9.0 | 0.803006 | 12.785396 | 2.982973 | 1289 | 635 | 49.262995 |
SavingAccount_Days_with_Debits_var
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.408663 | -0.408663 | -0.408663 | 12092 | 2688 | 22.229573 |
| 1 | 5.0 | -0.364147 | -0.272408 | -0.326267 | 1788 | 526 | 29.418345 |
| 2 | 6.0 | -0.272408 | -0.119121 | -0.249566 | 2545 | 831 | 32.652259 |
| 3 | 7.0 | -0.119121 | 0.204484 | -0.017488 | 2881 | 1047 | 36.341548 |
| 4 | 8.0 | 0.204484 | 1.570371 | 0.710316 | 2597 | 1180 | 45.437043 |
| 5 | 9.0 | 1.584065 | 12.524903 | 3.389561 | 1288 | 551 | 42.779503 |
Operations_ATM
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.251284 | -0.251284 | -0.251284 | 19651 | 5029 | 25.591573 |
| 1 | 8.0 | -0.002669 | 0.991790 | 0.261233 | 2228 | 1005 | 45.107720 |
| 2 | 9.0 | 1.240405 | 26.847729 | 3.320090 | 1312 | 789 | 60.137195 |
SavingAccount_Active_ARG_Salary
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.335318 | -0.335318 | -0.335318 | 20847 | 5453 | 26.157241 |
| 1 | 9.0 | 2.982243 | 2.982243 | 2.982243 | 2344 | 1370 | 58.447099 |
SavingAccount_Transfer_Out_Amount_sum
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.230216 | -0.230216 | -0.230216 | 20709 | 5532 | 26.713023 |
| 1 | 8.0 | -0.230131 | 1.197264 | 0.251976 | 1193 | 573 | 48.030176 |
| 2 | 9.0 | 1.197851 | 10.713349 | 3.465422 | 1289 | 718 | 55.702095 |
SavingAccount_Salary_Payment_Transactions
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.269507 | -0.269507 | -0.269507 | 21283 | 5583 | 26.232204 |
| 1 | 8.0 | 1.607974 | 1.607974 | 1.607974 | 930 | 585 | 62.903226 |
| 2 | 9.0 | 3.485454 | 5.362935 | 4.335888 | 978 | 655 | 66.973415 |
CreditCard_Spending_Aut_Debits_nunique
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.717120 | -0.717120 | -0.717120 | 13308 | 2428 | 18.244665 |
| 1 | 6.0 | -0.181371 | 0.354379 | 0.046653 | 4013 | 1613 | 40.194368 |
| 2 | 7.0 | 0.890128 | 0.890128 | 0.890128 | 1361 | 612 | 44.966936 |
| 3 | 8.0 | 1.425878 | 1.961627 | 1.806332 | 4509 | 2170 | 48.125970 |
Operations_HomeBanking_x
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.428762 | -0.428762 | -0.428762 | 16415 | 4053 | 24.690832 |
| 1 | 7.0 | -0.061738 | 0.305287 | 0.070739 | 3416 | 1126 | 32.962529 |
| 2 | 8.0 | 0.672311 | 1.773385 | 1.082601 | 2019 | 875 | 43.338286 |
| 3 | 9.0 | 2.140409 | 5.076606 | 3.438270 | 1341 | 769 | 57.345265 |
DebitCard_Active
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | -1.423364 | -1.423364 | -1.423364 | 7664 | 1800 | 23.48643 |
| 1 | 6.0 | 0.702561 | 0.702561 | 0.702561 | 15527 | 5023 | 32.35010 |
Loan_Active
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.383097 | -0.383097 | -0.383097 | 20223 | 5470 | 27.048410 |
| 1 | 8.0 | 2.610303 | 2.610303 | 2.610303 | 2968 | 1353 | 45.586253 |
SavingAccount_Days_with_Debits_nunique
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.788464 | -0.788464 | -0.788464 | 12092 | 2688 | 22.229573 |
| 1 | 6.0 | 0.040048 | 0.040048 | 0.040048 | 4309 | 1346 | 31.236946 |
| 2 | 7.0 | 0.868561 | 0.868561 | 0.868561 | 3690 | 1386 | 37.560976 |
| 3 | 8.0 | 1.697073 | 1.697073 | 1.697073 | 2145 | 983 | 45.827506 |
| 4 | 9.0 | 2.525585 | 3.354098 | 2.634897 | 955 | 420 | 43.979058 |
SavingAccount_CreditCard_Payment_Amount
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.369675 | -0.369675 | -0.369675 | 17551 | 3639 | 20.733861 |
| 1 | 7.0 | -0.369673 | 0.187913 | -0.122015 | 1774 | 911 | 51.352875 |
| 2 | 8.0 | 0.188039 | 1.866350 | 0.827353 | 2577 | 1489 | 57.780365 |
| 3 | 9.0 | 1.869587 | 5.489941 | 3.547350 | 1289 | 784 | 60.822343 |
Operations_Ivr
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.198273 | -0.198273 | -0.198273 | 18778 | 5125 | 27.292576 |
| 1 | 8.0 | 0.206850 | 0.206850 | 0.206850 | 2747 | 1017 | 37.022206 |
| 2 | 9.0 | 0.611973 | 39.908903 | 1.893728 | 1666 | 681 | 40.876351 |
CreditCard_Spending_1_Installment_nunique
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.857834 | -0.857834 | -0.857834 | 11364 | 1686 | 14.836325 |
| 1 | 5.0 | -0.362342 | -0.362342 | -0.362342 | 2059 | 803 | 38.999514 |
| 2 | 6.0 | 0.133151 | 0.628644 | 0.374648 | 3568 | 1452 | 40.695067 |
| 3 | 7.0 | 1.124137 | 1.124137 | 1.124137 | 1784 | 802 | 44.955157 |
| 4 | 8.0 | 1.619630 | 1.619630 | 1.619630 | 4416 | 2080 | 47.101449 |
Region_REGION CUYO
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.275860 | -0.275860 | -0.275860 | 21551 | 6334 | 29.390748 |
| 1 | 9.0 | 3.625032 | 3.625032 | 3.625032 | 1640 | 489 | 29.817073 |
CreditCard_Spending_Aut_Debits_count_nonzero
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -0.909830 | -0.909830 | -0.909830 | 12293 | 2000 | 16.269422 |
| 1 | 5.0 | -0.565618 | 0.811232 | 0.200570 | 1474 | 485 | 32.903664 |
| 2 | 7.0 | 1.155444 | 1.155444 | 1.155444 | 9424 | 4338 | 46.031409 |
Mobile
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | -2.721880 | -2.721880 | -2.721880 | 2758 | 647 | 23.459028 |
| 1 | 5.0 | 0.367393 | 0.367393 | 0.367393 | 20433 | 6176 | 30.225615 |
SavingAccount_Days_with_Credits_count_nonzero
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | -1.026614 | -1.026614 | -1.026614 | 10409 | 2188 | 21.020271 |
| 1 | 4.0 | -0.673649 | -0.320684 | -0.535876 | 1158 | 243 | 20.984456 |
| 2 | 5.0 | 0.032281 | 0.738212 | 0.480955 | 2257 | 625 | 27.691626 |
| 3 | 7.0 | 1.091177 | 1.091177 | 1.091177 | 9367 | 3767 | 40.215651 |
Region_BUENOS AIRES
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 3.0 | -0.656852 | -0.656852 | -0.656852 | 16201 | 4685 | 28.917968 |
| 1 | 8.0 | 1.522412 | 1.522412 | 1.522412 | 6990 | 2138 | 30.586552 |
Region_REGION CENTRO
| rank | min | max | average | # of clients | # of target clients | target_p | |
|---|---|---|---|---|---|---|---|
| 0 | 4.0 | -0.496008 | -0.496008 | -0.496008 | 18612 | 5707 | 30.663013 |
| 1 | 8.0 | 2.016096 | 2.016096 | 2.016096 | 4579 | 1116 | 24.372134 |
Select the main features¶
selected_features = list(a['column'][:50])
selected_features += ['client_id', 'TGT']
selected_features
['CreditCard_Balance_ARG_var', 'TotalOperations', 'CreditCard_Total_Limit', 'CreditCard_Total_Spending', 'SavingAccount_Total_Amount', 'Operations_Bank', 'Client_Age_grp', 'Operations_Terminal', 'CreditCard_Total_Spending_var', 'Operations_HomeBanking_var', 'CreditCard_Revolving_min', 'CreditCard_Total_Limit_var', 'CreditCard_Revolving', 'CreditCard_Spending_Aut_Debits_var', 'SavingAccount_Transactions_Transactions_var', 'SavingAccount_Total_Amount_var', 'CreditCard_Balance_ARG', 'CreditCard_Revolving_var', 'CreditCard_Spending_Aut_Debits', 'SavingAccount_Days_with_Credits_var', 'SavingAccount_Credits_Transactions_var', 'CreditCard_Balance_DOLLAR_sum', 'TotalInsurances', 'Operations_Telemarketer', 'Operations_Mobile_var', 'CreditCard_Active', 'SavingAccount_Transfer_In_Amount_var', 'SavingAccount_DebitCard_Spend_Transactions', 'SavingAccount_DebitCard_Spend_Amount_var', 'SavingAccount_Salary_Payment_Amount_var', 'CreditCard_Spending_1_Installment_min', 'SavingAccount_Days_with_Debits_var', 'Operations_ATM', 'SavingAccount_Active_ARG_Salary', 'SavingAccount_Transfer_Out_Amount_sum', 'SavingAccount_Salary_Payment_Transactions', 'CreditCard_Spending_Aut_Debits_nunique', 'Operations_HomeBanking_x', 'DebitCard_Active', 'Loan_Active', 'SavingAccount_Days_with_Debits_nunique', 'SavingAccount_CreditCard_Payment_Amount', 'Operations_Ivr', 'CreditCard_Spending_1_Installment_nunique', 'Region_REGION CUYO', 'CreditCard_Spending_Aut_Debits_count_nonzero', 'Mobile', 'SavingAccount_Days_with_Credits_count_nonzero', 'Region_BUENOS AIRES', 'Region_REGION CENTRO', 'client_id', 'TGT']
ABT_Model_Select = ABT[selected_features].copy()
Split in Train and Test¶
ABT_Model_Select.shape
(23191, 52)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(ABT_Model_Select, test_size=0.3, random_state=42, stratify=ABT_Model_Select['TGT']);
X_train.TGT.value_counts()
TGT 0.0 11457 1.0 4776 Name: count, dtype: int64
X_test.TGT.value_counts()
TGT 0.0 4911 1.0 2047 Name: count, dtype: int64
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
target_column = 'TGT'
numerical_cols = [x for x in ABT_Model_Select.columns if (x != 'client_id') & (x != 'TGT')]
estimator = XGBClassifier(
objective= 'binary:logistic',
seed=42
)
parameters = {
'max_depth': np.arange(6, 10, 1),
'learning_rate': np.arange(0.01, 1, 0.05),
'gamma': np.arange(0.1, 2, 0.1),
'alpha': np.arange(0,10,1),
'lambda': np.arange(0,10,1),
'subsample': np.arange(0.1, 1, 0.1),
'n_estimators': np.arange(15, 20, 1)
}
cross_val = StratifiedKFold(n_splits=3)
grid_search = GridSearchCV(
estimator=estimator,
param_grid=parameters,
scoring = 'roc_auc',
n_jobs = 4,
cv = cross_val,
verbose=True
)
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100
grid_search = RandomizedSearchCV(
estimator=estimator,
param_distributions=parameters,
n_iter= n_HP_points_to_test,
scoring='roc_auc',
cv= cross_val,
refit= True,
verbose= True)
grid_search.fit(X_train[numerical_cols], X_train[target_column])
Fitting 3 folds for each of 100 candidates, totalling 300 fits
RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, grow_policy=None,
impor...
'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
'max_depth': array([6, 7, 8, 9]),
'n_estimators': array([15, 16, 17, 18, 19]),
'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
scoring='roc_auc', verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, grow_policy=None,
impor...
'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
1.4, 1.5, 1.6, 1.7, 1.8, 1.9]),
'lambda': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
'learning_rate': array([0.01, 0.06, 0.11, 0.16, 0.21, 0.26, 0.31, 0.36, 0.41, 0.46, 0.51,
0.56, 0.61, 0.66, 0.71, 0.76, 0.81, 0.86, 0.91, 0.96]),
'max_depth': array([6, 7, 8, 9]),
'n_estimators': array([15, 16, 17, 18, 19]),
'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
scoring='roc_auc', verbose=True)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)best_parameters = grid_search.best_estimator_
best_parameters.get_xgb_params()
{'objective': 'binary:logistic',
'base_score': None,
'booster': None,
'colsample_bylevel': None,
'colsample_bynode': None,
'colsample_bytree': None,
'device': None,
'eval_metric': None,
'gamma': 1.7000000000000002,
'grow_policy': None,
'interaction_constraints': None,
'learning_rate': 0.26,
'max_bin': None,
'max_cat_threshold': None,
'max_cat_to_onehot': None,
'max_delta_step': None,
'max_depth': 7,
'max_leaves': None,
'min_child_weight': None,
'monotone_constraints': None,
'multi_strategy': None,
'n_jobs': None,
'num_parallel_tree': None,
'random_state': None,
'reg_alpha': None,
'reg_lambda': None,
'sampling_method': None,
'scale_pos_weight': None,
'subsample': 0.8,
'tree_method': None,
'validate_parameters': None,
'verbosity': None,
'seed': 42,
'lambda': 0,
'alpha': 9}
Modelling¶
#create model
model = XGBClassifier(objective='binary:logistic',
booster='gbtree',
seed=42,
gamma=1.7000000000000002,
learning_rate=0.26,
max_depth=7,
subsample=0.8,
n_estimators=19,
alpha=9,
)
Train¶
# fit model
model.fit(X_train[numerical_cols], X_train[target_column])
XGBClassifier(alpha=9, base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=1.7000000000000002, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.26, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=7, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=19, n_jobs=None,
num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(alpha=9, base_score=None, booster='gbtree', callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=1.7000000000000002, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.26, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=7, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=19, n_jobs=None,
num_parallel_tree=None, ...)from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(X_train[target_column], model.predict(X_train[numerical_cols])) * 100
print("Accuracy of Model: ",accuracy)
print(classification_report(X_train[target_column], model.predict(X_train[numerical_cols])))
pd.crosstab(X_train[target_column], model.predict(X_train[numerical_cols]))
Accuracy of Model: 81.12486909382123
precision recall f1-score support
0.0 0.84 0.90 0.87 11457
1.0 0.72 0.59 0.65 4776
accuracy 0.81 16233
macro avg 0.78 0.75 0.76 16233
weighted avg 0.80 0.81 0.81 16233
| col_0 | 0 | 1 |
|---|---|---|
| TGT | ||
| 0.0 | 10368 | 1089 |
| 1.0 | 1975 | 2801 |
Test¶
y_pred = model.predict(X_test[numerical_cols])
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(X_test[target_column], y_pred) * 100
print("Accuracy of Model: ",accuracy)
print(classification_report(X_test[target_column], y_pred))
pd.crosstab(X_test[target_column], y_pred)
Accuracy of Model: 79.04570278815751
precision recall f1-score support
0.0 0.82 0.90 0.86 4911
1.0 0.68 0.54 0.60 2047
accuracy 0.79 6958
macro avg 0.75 0.72 0.73 6958
weighted avg 0.78 0.79 0.78 6958
| col_0 | 0 | 1 |
|---|---|---|
| TGT | ||
| 0.0 | 4396 | 515 |
| 1.0 | 943 | 1104 |
Deciles¶
# results..
from sklearn.metrics import mean_squared_error
probabilities_train = model.predict_proba(X_train[numerical_cols])
probabilities = model.predict_proba(X_test[numerical_cols])
y_pred = model.predict(X_test[numerical_cols])
a = pd.DataFrame(X_train[['client_id', target_column]], columns=['TGT', 'idx'])
a = a.reset_index()
b = pd.DataFrame(probabilities_train[:,1], columns=['Prob1'])
result = pd.concat([a, b], axis=1)
result['porc'] = result['Prob1'].rank(pct=True) * 100
len(probabilities[:,1])
result.loc[result['porc'].between(0, 10, inclusive='neither'), 'decil'] = '10'
result.loc[result['porc'].between(10, 20, inclusive='both'), 'decil'] = '9'
result.loc[result['porc'].between(20, 30, inclusive='neither'), 'decil'] = '8'
result.loc[result['porc'].between(30, 40, inclusive='both'), 'decil'] = '7'
result.loc[result['porc'].between(40, 50, inclusive='neither'), 'decil'] = '6'
result.loc[result['porc'].between(50, 60, inclusive='both'), 'decil'] = '5'
result.loc[result['porc'].between(60, 70, inclusive='neither'), 'decil'] = '4'
result.loc[result['porc'].between(70, 80, inclusive='both'), 'decil'] = '3'
result.loc[result['porc'].between(80, 90, inclusive='neither'), 'decil'] = '2'
result.loc[result['porc'].between(90, 101, inclusive='both'), 'decil'] = '1'
print(result.decil.value_counts().sort_index())
print(result[result.TGT == 1].decil.value_counts().sort_index())
a = result.groupby('decil')['Prob1'].agg('min')
print(a.sort_index())
decil 1 1624 10 1691 2 1623 3 1623 4 1624 5 1623 6 1623 7 1624 8 1623 9 1555 Name: count, dtype: int64 decil 1 1354 10 5 2 1048 3 815 4 594 5 423 6 308 7 159 8 53 9 17 Name: count, dtype: int64 decil 1 0.688578 10 0.012251 2 0.539833 3 0.426468 4 0.332262 5 0.251477 6 0.164156 7 0.075242 8 0.031342 9 0.014199 Name: Prob1, dtype: float32
##############################################
# test
a = pd.DataFrame(X_test[[target_column, 'client_id']], columns=['TGT', 'idx'])
a = a.reset_index()
b = pd.DataFrame(probabilities[:,1], columns=['Prob1'])
result = pd.concat([a, b], axis=1)
result['porc'] = result['Prob1'].rank(pct=True) * 100
result['decil'] = np.where(result.Prob1 >= 0.688578 , 1,
np.where((result.Prob1 >= 0.539833 ) & (result.Prob1 < 0.688578 ), 2,
np.where((result.Prob1 >= 0.426468 ) & (result.Prob1 < 0.539833 ) , 3,
np.where((result.Prob1 >= 0.332262 ) & (result.Prob1 < 0.426468 ), 4,
np.where((result.Prob1 >= 0.251477 ) & (result.Prob1 < 0.332262 ), 5,
np.where((result.Prob1 >= 0.164156 ) & (result.Prob1 < 0.251477 ), 6,
np.where((result.Prob1 >= 0.075242 ) & (result.Prob1 < 0.164156 ) , 7,
np.where((result.Prob1 >= 0.031342 ) & (result.Prob1 < 0.075242 ), 8,
np.where((result.Prob1 >= 0.014199 ) & (result.Prob1 < 0.031342 ), 9,
10)))))))))
print("Total")
print(result.decil.value_counts().sort_index())
print("Buenos")
print(result[result.TGT == 1].decil.value_counts())
Total decil 1 694 2 702 3 697 4 676 5 664 6 703 7 697 8 711 9 672 10 742 Name: count, dtype: int64 Buenos decil 1 531 2 452 3 340 4 278 5 173 6 149 7 89 8 28 9 4 10 3 Name: count, dtype: int64
Performance¶
y_test = X_test['TGT']
y_pred = model.predict(X_test[numerical_cols])
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[4374 537] [ 943 1104]]
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
threshold = 0.45
y_pred = (model.predict_proba(X_test[numerical_cols])[:, 1] > threshold).astype('float')
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
threshold = 0.42
y_pred = (model.predict_proba(X_test[numerical_cols])[:, 1] > threshold).astype('float')
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()
# ROC
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
yPred = y_pred
yScore = result['Prob1']
yTest = result['TGT']
areaBajoCurvaRoc = roc_auc_score(yTest, yScore)
accuracy = accuracy_score(yTest, yPred)
fpr, tpr, _ = roc_curve(yTest, yScore)
plt.plot(fpr, tpr)
plt.plot([0,1])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.annotate('Area bajo la curva ROC : {}'.format(areaBajoCurvaRoc), (-0.02,0.99))
plt.annotate('Accuracy : {}'.format(accuracy), (-0.02,0.94))
plt.show()
display()
pVar = 'Prob1'
result['porc'] = result[pVar].rank(pct=True) * 100
len(probabilities[:,1])
result.loc[result['porc'].between(0, 10, inclusive='neither'), 'decil'] = '10'
result.loc[result['porc'].between(10, 20, inclusive='both'), 'decil'] = '9'
result.loc[result['porc'].between(20, 30, inclusive='neither'), 'decil'] = '8'
result.loc[result['porc'].between(30, 40, inclusive='both'), 'decil'] = '7'
result.loc[result['porc'].between(40, 50, inclusive='neither'), 'decil'] = '6'
result.loc[result['porc'].between(50, 60, inclusive='both'), 'decil'] = '5'
result.loc[result['porc'].between(60, 70, inclusive='neither'), 'decil'] = '4'
result.loc[result['porc'].between(70, 80, inclusive='both'), 'decil'] = '3'
result.loc[result['porc'].between(80, 90, inclusive='neither'), 'decil'] = '2'
result.loc[result['porc'].between(90, 101, inclusive='both'), 'decil'] = '1'
a = pd.DataFrame(result.decil.value_counts().reset_index())
a.columns = ['index','decil']
b = pd.DataFrame(result[result.TGT == 1].decil.value_counts().reset_index())
b.columns = ['index','decil']
b
| index | decil | |
|---|---|---|
| 0 | 1 | 531 |
| 1 | 2 | 450 |
| 2 | 3 | 342 |
| 3 | 4 | 283 |
| 4 | 5 | 178 |
| 5 | 6 | 143 |
| 6 | 7 | 86 |
| 7 | 8 | 27 |
| 8 | 9 | 4 |
| 9 | 10 | 3 |
c = a.merge(b, how='left', on='index')
c['TGT_%'] = (c['decil_y'] /c['decil_x'])*100
c
| index | decil_x | decil_y | TGT_% | |
|---|---|---|---|---|
| 0 | 10 | 742 | 3 | 0.404313 |
| 1 | 8 | 697 | 27 | 3.873745 |
| 2 | 4 | 696 | 283 | 40.660920 |
| 3 | 1 | 696 | 531 | 76.293103 |
| 4 | 2 | 696 | 450 | 64.655172 |
| 5 | 3 | 696 | 342 | 49.137931 |
| 6 | 5 | 696 | 178 | 25.574713 |
| 7 | 6 | 695 | 143 | 20.575540 |
| 8 | 7 | 695 | 86 | 12.374101 |
| 9 | 9 | 649 | 4 | 0.616333 |